rairo committed on
Commit
d7ded78
·
verified ·
1 Parent(s): 2107a21

Update sozo_gen.py

Browse files
Files changed (1) hide show
  1. sozo_gen.py +297 -333
sozo_gen.py CHANGED
@@ -268,388 +268,352 @@ def sanitize_for_firebase_key(text: str) -> str:
268
  return text
269
 
270
  # REPLACE THE OLD generate_report_draft WITH THIS CORRECTED VERSION
271
- def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
 
 
 
272
  """
273
- Enhanced autonomous data analysis function that intelligently analyzes any dataset
274
- and generates comprehensive, domain-appropriate reports with contextual visualizations.
275
-
276
- Maintains backward compatibility with existing function signature and outputs.
277
  """
278
- logging.info(f"Generating enhanced autonomous report draft for project {project_id}")
279
 
280
- # Load data safely (existing functionality preserved)
281
- df = load_dataframe_safely(buf, name)
 
 
 
 
 
 
 
282
 
283
- # Initialize LLM (existing setup preserved)
284
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
 
285
 
286
- # Enhanced autonomous data analysis
287
- try:
288
- # Stage 1: Intelligent Data Classification and Deep Analysis
289
- autonomous_context = perform_autonomous_data_analysis(df, ctx, name)
290
-
291
- # Stage 2: Generate Enhanced Report with Intelligent Narrative
292
- enhanced_report = generate_intelligent_report(llm, autonomous_context)
293
-
294
- # Stage 3: Smart Chart Generation
295
- chart_urls = generate_autonomous_charts(llm, df, enhanced_report, uid, project_id, bucket)
296
-
297
- # Preserve original output structure
298
- return {"raw_md": enhanced_report, "chartUrls": chart_urls}
299
-
300
- except Exception as e:
301
- logging.error(f"Enhanced analysis failed, falling back to original: {str(e)}")
302
- # Fallback to original logic if enhancement fails
303
- return generate_original_report(df, llm, ctx, uid, project_id, bucket)
304
-
305
-
306
- def perform_autonomous_data_analysis(df: pd.DataFrame, user_ctx: str, filename: str) -> Dict[str, Any]:
307
- """
308
- Performs comprehensive autonomous analysis of the dataset to understand its nature,
309
- domain, and analytical potential.
310
- """
311
- logging.info("Performing autonomous data analysis...")
312
-
313
- # Basic data profiling with JSON-safe types
314
- basic_info = {
315
- "shape": df.shape,
316
- "columns": list(df.columns),
317
- "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
318
- "filename": filename,
319
- "user_context": user_ctx
320
- }
321
 
322
- # Intelligent domain classification
323
- domain_analysis = classify_dataset_domain(df, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
- # Advanced statistical analysis
326
- statistical_profile = generate_statistical_profile(df)
 
 
 
 
 
 
 
 
 
327
 
328
- # Relationship discovery
329
- relationships = discover_data_relationships(df)
330
 
331
- # Temporal analysis if applicable
332
- temporal_insights = analyze_temporal_patterns(df)
333
 
334
- # Data quality assessment
335
- quality_metrics = assess_data_quality(df)
336
 
337
- # Business context inference
338
- business_context = infer_business_context(df, domain_analysis)
339
 
340
  return {
341
- "basic_info": basic_info,
342
- "domain": domain_analysis,
343
- "statistics": statistical_profile,
344
- "relationships": relationships,
345
- "temporal": temporal_insights,
346
- "quality": quality_metrics,
347
- "business_context": business_context,
348
- "analysis_complexity": determine_analysis_complexity(df, domain_analysis)
 
 
 
 
349
  }
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- def classify_dataset_domain(df: pd.DataFrame, filename: str) -> Dict[str, Any]:
353
  """
354
- Intelligently classifies the dataset domain based on column patterns, data types,
355
- and semantic analysis.
356
  """
357
- domain_indicators = {
358
- "financial": ["amount", "price", "cost", "revenue", "profit", "transaction", "payment", "invoice"],
359
- "survey": ["rating", "satisfaction", "response", "score", "survey", "feedback", "opinion"],
360
- "scientific": ["measurement", "experiment", "test", "sample", "observation", "hypothesis", "variable"],
361
- "marketing": ["campaign", "click", "conversion", "customer", "lead", "acquisition", "retention"],
362
- "operational": ["process", "time", "duration", "status", "workflow", "performance", "efficiency"],
363
- "sales": ["order", "product", "quantity", "sales", "customer", "deal", "pipeline"],
364
- "hr": ["employee", "salary", "department", "performance", "training", "recruitment"],
365
- "healthcare": ["patient", "diagnosis", "treatment", "medical", "health", "symptom", "medication"]
366
- }
367
 
368
- # Analyze column names for domain indicators
369
- columns_lower = [col.lower() for col in df.columns]
370
- domain_scores = {}
371
-
372
- for domain, keywords in domain_indicators.items():
373
- score = sum(1 for col in columns_lower for keyword in keywords if keyword in col)
374
- domain_scores[domain] = score
375
 
376
- # Filename analysis
377
- filename_lower = filename.lower()
378
- for domain, keywords in domain_indicators.items():
379
- if any(keyword in filename_lower for keyword in keywords):
380
- domain_scores[domain] = domain_scores.get(domain, 0) + 2
381
 
382
- # Data type analysis
383
- numeric_ratio = len(df.select_dtypes(include=[np.number]).columns) / len(df.columns)
384
- categorical_ratio = len(df.select_dtypes(include=['object']).columns) / len(df.columns)
385
-
386
- # Determine primary domain
387
- primary_domain = max(domain_scores, key=domain_scores.get) if domain_scores else "general"
388
-
389
- return {
390
- "primary_domain": primary_domain,
391
- "domain_confidence": int(domain_scores.get(primary_domain, 0)),
392
- "domain_scores": {k: int(v) for k, v in domain_scores.items()},
393
- "data_characteristics": {
394
- "numeric_ratio": float(numeric_ratio),
395
- "categorical_ratio": float(categorical_ratio),
396
- "is_time_series": detect_time_series(df),
397
- "is_transactional": detect_transactional_data(df),
398
- "is_experimental": detect_experimental_data(df)
399
- }
400
- }
401
 
 
 
402
 
403
- def generate_statistical_profile(df: pd.DataFrame) -> Dict[str, Any]:
404
- """
405
- Generates comprehensive statistical profile of the dataset.
406
- """
407
- profile = {
408
- "summary_stats": {},
409
- "correlations": {},
410
- "distributions": {},
411
- "outliers": {},
412
- "missing_data": {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  }
414
 
415
- # Summary statistics for numeric columns with JSON-safe conversion
416
- numeric_cols = df.select_dtypes(include=[np.number]).columns
417
- if len(numeric_cols) > 0:
418
- desc_stats = df[numeric_cols].describe()
419
- # Convert to JSON-safe format
420
- profile["summary_stats"] = {
421
- col: {
422
- stat: float(val) if pd.notna(val) else None
423
- for stat, val in desc_stats[col].items()
424
- }
425
- for col in desc_stats.columns
426
- }
427
-
428
- # Correlation analysis with JSON-safe conversion
429
- if len(numeric_cols) > 1:
430
- corr_matrix = df[numeric_cols].corr()
431
- # Find strong correlations
432
- strong_corrs = []
433
- for i in range(len(corr_matrix.columns)):
434
- for j in range(i+1, len(corr_matrix.columns)):
435
- corr_val = corr_matrix.iloc[i, j]
436
- if abs(corr_val) > 0.7 and pd.notna(corr_val): # Strong correlation threshold
437
- strong_corrs.append({
438
- "var1": corr_matrix.columns[i],
439
- "var2": corr_matrix.columns[j],
440
- "correlation": float(corr_val)
441
- })
442
- profile["correlations"] = {"strong_correlations": strong_corrs}
443
-
444
- # Categorical analysis
445
- categorical_cols = df.select_dtypes(include=['object']).columns
446
- if len(categorical_cols) > 0:
447
- profile["categorical_analysis"] = {}
448
- for col in categorical_cols:
449
- value_counts = df[col].value_counts().head(5)
450
- profile["categorical_analysis"][col] = {
451
- "unique_count": int(df[col].nunique()),
452
- "top_values": {str(k): int(v) for k, v in value_counts.items()}
453
- }
454
 
455
- # Missing data analysis with JSON-safe conversion
456
- missing_data = df.isnull().sum()
457
- missing_dict = {}
458
- for col, missing_count in missing_data.items():
459
- if missing_count > 0:
460
- missing_dict[col] = int(missing_count)
461
 
462
- profile["missing_data"] = {
463
- "columns_with_missing": missing_dict,
464
- "total_missing_percentage": float((df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100)
465
- }
466
 
467
- return profile
468
-
 
 
469
 
470
- def discover_data_relationships(df: pd.DataFrame) -> Dict[str, Any]:
471
- """
472
- Discovers meaningful relationships and patterns in the data.
473
- """
474
- relationships = {
475
- "key_relationships": [],
476
- "patterns": [],
477
- "anomalies": []
478
- }
479
 
480
- # Identify potential key relationships
481
- numeric_cols = df.select_dtypes(include=[np.number]).columns
482
 
483
- if len(numeric_cols) > 1:
484
- # Find interesting relationships
485
- for col1 in numeric_cols:
486
- for col2 in numeric_cols:
487
- if col1 != col2:
488
- correlation = df[col1].corr(df[col2])
489
- if abs(correlation) > 0.5 and pd.notna(correlation): # Moderate to strong correlation
490
- relationships["key_relationships"].append({
491
- "variable1": col1,
492
- "variable2": col2,
493
- "relationship_strength": float(correlation),
494
- "relationship_type": "positive" if correlation > 0 else "negative"
495
- })
496
-
497
- # Identify patterns in categorical data
498
- categorical_cols = df.select_dtypes(include=['object']).columns
499
- for col in categorical_cols:
500
- if df[col].nunique() < 20: # Reasonable number of categories
501
- value_counts = df[col].value_counts()
502
- if len(value_counts) > 0:
503
- relationships["patterns"].append({
504
- "column": col,
505
- "pattern_type": "categorical_distribution",
506
- "dominant_category": str(value_counts.index[0]),
507
- "dominance_percentage": float((value_counts.iloc[0] / len(df)) * 100)
508
- })
509
 
510
- return relationships
511
-
512
-
513
- def analyze_temporal_patterns(df: pd.DataFrame) -> Dict[str, Any]:
514
- """
515
- Analyzes temporal patterns if time-based columns are detected.
516
- """
517
- temporal_insights = {"has_temporal_data": False}
 
 
 
518
 
519
- # Detect date/time columns
520
- date_columns = []
521
- for col in df.columns:
522
- if df[col].dtype == 'datetime64[ns]' or 'date' in col.lower() or 'time' in col.lower():
523
- try:
524
- pd.to_datetime(df[col])
525
- date_columns.append(col)
526
- except:
527
- continue
528
-
529
- if date_columns:
530
- temporal_insights["has_temporal_data"] = True
531
- temporal_insights["date_columns"] = date_columns
532
-
533
- # Analyze temporal patterns for the first date column
534
- primary_date_col = date_columns[0]
535
- df_temp = df.copy()
536
- df_temp[primary_date_col] = pd.to_datetime(df_temp[primary_date_col])
537
-
538
- temporal_insights["temporal_analysis"] = {
539
- "date_range": {
540
- "start": df_temp[primary_date_col].min().strftime('%Y-%m-%d'),
541
- "end": df_temp[primary_date_col].max().strftime('%Y-%m-%d')
542
- },
543
- "time_span_days": int((df_temp[primary_date_col].max() - df_temp[primary_date_col].min()).days),
544
- "frequency": detect_temporal_frequency(df_temp[primary_date_col])
545
  }
546
 
547
- return temporal_insights
548
-
 
 
549
 
550
- def assess_data_quality(df: pd.DataFrame) -> Dict[str, Any]:
551
  """
552
- Assesses data quality and identifies potential issues.
553
  """
554
- quality_metrics = {
555
- "overall_quality_score": 0,
556
- "quality_issues": [],
557
- "data_completeness": 0,
558
- "data_consistency": {}
559
- }
560
 
561
- # Completeness assessment with JSON-safe conversion
562
- completeness = float((1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100)
563
- quality_metrics["data_completeness"] = completeness
564
 
565
- # Identify quality issues
566
- if completeness < 95:
567
- quality_metrics["quality_issues"].append("Missing data detected")
568
 
569
- # Check for duplicates with JSON-safe conversion
570
- duplicate_rows = int(df.duplicated().sum())
571
- if duplicate_rows > 0:
572
- quality_metrics["quality_issues"].append(f"{duplicate_rows} duplicate rows found")
573
 
574
- # Check for inconsistent data types
575
- for col in df.columns:
576
- if df[col].dtype == 'object':
577
- if df[col].str.isnumeric().any() and not df[col].str.isnumeric().all():
578
- quality_metrics["quality_issues"].append(f"Inconsistent data types in {col}")
579
 
580
- # Calculate overall quality score with JSON-safe conversion
581
- base_score = 100.0
582
- base_score -= (100 - completeness) * 0.5 # Penalize missing data
583
- base_score -= len(quality_metrics["quality_issues"]) * 5 # Penalize each quality issue
584
- quality_metrics["overall_quality_score"] = float(max(0, base_score))
585
 
586
- return quality_metrics
587
-
588
-
589
- def infer_business_context(df: pd.DataFrame, domain_analysis: Dict[str, Any]) -> Dict[str, Any]:
590
- """
591
- Infers business context and potential use cases based on the data characteristics.
592
- """
593
- domain = domain_analysis["primary_domain"]
594
 
595
- context_mapping = {
596
- "financial": {
597
- "key_metrics": ["Revenue", "Profit", "Cost", "ROI"],
598
- "typical_analyses": ["Trend analysis", "Profitability analysis", "Budget vs actual"],
599
- "stakeholders": ["CFO", "Finance team", "Executive leadership"]
600
- },
601
- "survey": {
602
- "key_metrics": ["Satisfaction scores", "Response rates", "Sentiment"],
603
- "typical_analyses": ["Satisfaction analysis", "Demographic breakdown", "Correlation analysis"],
604
- "stakeholders": ["Marketing team", "Product managers", "Customer success"]
605
- },
606
- "scientific": {
607
- "key_metrics": ["Statistical significance", "Effect size", "Confidence intervals"],
608
- "typical_analyses": ["Hypothesis testing", "Regression analysis", "Experimental validation"],
609
- "stakeholders": ["Researchers", "Scientists", "Academic community"]
610
- },
611
- "marketing": {
612
- "key_metrics": ["Conversion rates", "Customer acquisition cost", "Campaign ROI"],
613
- "typical_analyses": ["Campaign performance", "Customer segmentation", "Attribution analysis"],
614
- "stakeholders": ["Marketing team", "CMO", "Sales team"]
615
- }
616
- }
 
617
 
618
- return context_mapping.get(domain, {
619
- "key_metrics": ["Performance indicators", "Trends", "Patterns"],
620
- "typical_analyses": ["Descriptive analysis", "Trend identification", "Pattern recognition"],
621
- "stakeholders": ["Business stakeholders", "Decision makers"]
622
- })
623
-
624
-
625
- def generate_intelligent_report(llm, autonomous_context: Dict[str, Any]) -> str:
626
- """
627
- Generates an intelligent, domain-appropriate report with organic storytelling.
628
- """
629
- # Create truly autonomous prompt that lets AI decide everything
630
- enhanced_prompt = f"""
631
- You are a world-class data analyst who has just been handed this dataset to analyze. Look at the data characteristics and tell me the most compelling story you can find.
632
-
633
- **DATASET CONTEXT:**
634
- {json.dumps(autonomous_context, indent=2)}
635
-
636
- **YOUR MISSION:**
637
- Analyze this data like you would if a CEO walked into your office and said "I need to understand what this data is telling us." Write a report that would make them say "This is exactly what I needed to know."
638
 
639
- **GUIDELINES:**
640
- - Don't follow a rigid structure - let the data guide your narrative
641
- - Choose your own headings and sections based on what the data reveals
642
- - Write like you're presenting findings to someone who needs to make important decisions
643
- - Include specific numbers and insights that matter
644
- - Insert chart recommendations like: `<generate_chart: "chart_type | description">`
645
- - Valid chart types: bar, pie, line, scatter, hist, box, heatmap
646
- - Only recommend charts that truly support your narrative
647
 
648
- **FORGET TEMPLATES - TELL THE STORY:**
649
- What's the most interesting, important, or surprising thing this data reveals? Start there and build your entire report around that central insight. Make it compelling, make it actionable, make it memorable.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
650
 
651
- Be the data analyst who gets promoted because they don't just present data - they reveal insights that drive business decisions.
652
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
 
654
 
655
  # Removed - no longer needed since we're letting AI decide everything organically
 
268
  return text
269
 
270
  # REPLACE THE OLD generate_report_draft WITH THIS CORRECTED VERSION
271
+ from scipy import stats
272
+ import re
273
+
274
def analyze_data_intelligence(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
    """
    Autonomous data intelligence system that classifies domain,
    detects patterns, and determines optimal analytical approach.

    Args:
        df: Dataset to profile.
        ctx_dict: Caller-supplied context dict (not read here; kept for
            interface compatibility with existing callers).

    Returns:
        A JSON-serializable dict describing the inferred domain, data
        structure, statistical profile, notable patterns, insight
        opportunities, and a suggested narrative framework.
    """

    # Domain Classification Engine: keyword signals matched against column names.
    domain_signals = {
        'financial': ['amount', 'price', 'cost', 'revenue', 'profit', 'balance', 'transaction', 'payment'],
        'survey': ['rating', 'satisfaction', 'score', 'response', 'feedback', 'opinion', 'agree', 'likert'],
        'scientific': ['measurement', 'experiment', 'trial', 'test', 'control', 'variable', 'hypothesis'],
        'marketing': ['campaign', 'conversion', 'click', 'impression', 'engagement', 'customer', 'segment'],
        'operational': ['performance', 'efficiency', 'throughput', 'capacity', 'utilization', 'process'],
        'temporal': ['date', 'time', 'timestamp', 'period', 'month', 'year', 'day', 'hour']
    }

    # Score each domain by how many column names contain one of its keywords.
    columns_lower = [col.lower() for col in df.columns]
    domain_scores = {
        domain: sum(1 for col in columns_lower if any(keyword in col for keyword in keywords))
        for domain, keywords in domain_signals.items()
    }

    # Fall back to 'general' when no column name matched any signal.
    primary_domain = max(domain_scores, key=domain_scores.get) if max(domain_scores.values()) > 0 else 'general'

    # Data Structure Analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Heuristic: a real datetime dtype OR a date/time-ish column name.
    is_timeseries = len(datetime_cols) > 0 or any('date' in col or 'time' in col for col in columns_lower)

    # Statistical Profile (values coerced to JSON-safe builtins)
    statistical_summary = {}
    if numeric_cols:
        outlier_counts = {}
        for col in numeric_cols:
            clean = df[col].dropna()
            if len(clean) == 0:
                continue
            # BUGFIX: apply the z-score mask to the NaN-free series itself.
            # The previous code indexed the original column with a mask
            # computed on dropna(), which misaligns (and can raise) whenever
            # the column contains NaNs.
            outlier_counts[col] = int((np.abs(stats.zscore(clean)) > 3).sum())

        statistical_summary = {
            # float() keeps the values JSON-serializable plain floats.
            'correlations': {k: float(v) for k, v in df[numeric_cols].corr().abs().max().items()},
            # scipy's normaltest needs at least 8 samples.
            'distributions': {
                col: 'normal' if stats.normaltest(df[col].dropna())[1] > 0.05 else 'non_normal'
                for col in numeric_cols if len(df[col].dropna()) > 8
            },
            'outliers': outlier_counts
        }

    # Pattern Detection
    # BUGFIX: bool() casts — pandas reductions return numpy.bool_, which
    # json.dumps (applied to this dict downstream in create_autonomous_prompt)
    # cannot serialize.
    patterns = {
        'has_missing_data': bool(df.isnull().sum().sum() > 0),
        'has_duplicates': bool(df.duplicated().sum() > 0),
        'has_negative_values': any(bool(df[col].min() < 0) for col in numeric_cols if len(df[col].dropna()) > 0),
        'has_categorical_hierarchy': any(len(df[col].unique()) > 10 for col in categorical_cols),
        'potential_segments': len(categorical_cols) > 0
    }

    # Insight Opportunities: which analysis angles this data can support.
    insight_opportunities = []

    if is_timeseries:
        insight_opportunities.append("temporal_trends")

    if len(numeric_cols) > 1:
        insight_opportunities.append("correlations")

    if len(categorical_cols) > 0 and len(numeric_cols) > 0:
        insight_opportunities.append("segmentation")

    if any(statistical_summary.get('outliers', {}).values()):
        insight_opportunities.append("anomalies")

    return {
        'primary_domain': primary_domain,
        'domain_confidence': domain_scores,
        'data_structure': {
            'is_timeseries': is_timeseries,
            'numeric_cols': numeric_cols,
            'categorical_cols': categorical_cols,
            'datetime_cols': datetime_cols
        },
        'statistical_profile': statistical_summary,
        'patterns': patterns,
        'insight_opportunities': insight_opportunities,
        'narrative_suggestions': get_narrative_suggestions(primary_domain, insight_opportunities, patterns)
    }
358
 
359
def get_narrative_suggestions(domain: str, opportunities: List[str], patterns: Dict) -> Dict[str, str]:
    """Select the storytelling framework (hook / structure / focus) for a domain.

    Note: `opportunities` and `patterns` are accepted for interface
    compatibility but do not currently influence the selection.
    """
    general_framework = {
        'hook': "Every dataset tells a story - here's what yours is saying",
        'structure': "overview → patterns → insights → implications",
        'focus': "key patterns, significant relationships, actionable insights"
    }

    frameworks_by_domain = {
        'financial': {
            'hook': "Follow the money trail that reveals your business's hidden opportunities",
            'structure': "performance → trends → risks → opportunities",
            'focus': "profitability, efficiency, growth patterns, risk indicators"
        },
        'survey': {
            'hook': "Your customers are speaking - here's what they're really saying",
            'structure': "sentiment → segments → drivers → actions",
            'focus': "satisfaction drivers, demographic patterns, improvement areas"
        },
        'scientific': {
            'hook': "The data reveals relationships that challenge conventional thinking",
            'structure': "hypothesis → evidence → significance → implications",
            'focus': "statistical significance, correlations, experimental validity"
        },
        'marketing': {
            'hook': "Discover the customer journey patterns driving your growth",
            'structure': "performance → segments → optimization → strategy",
            'focus': "conversion funnels, customer segments, campaign effectiveness"
        },
        'operational': {
            'hook': "Operational excellence lives in the details - here's where to look",
            'structure': "efficiency → bottlenecks → optimization → impact",
            'focus': "process efficiency, capacity utilization, improvement opportunities"
        },
        'general': general_framework,
    }

    # Unknown domains fall back to the general framework.
    try:
        return frameworks_by_domain[domain]
    except KeyError:
        return general_framework
396
 
397
def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
    """
    Generate a dynamic, intelligence-driven prompt that creates compelling narratives
    rather than following templates.

    Args:
        df: The dataset being analyzed (not read directly here; kept for
            interface symmetry with the other generation helpers).
        enhanced_ctx: JSON-serializable context produced by enhance_data_context().
        intelligence: Output of analyze_data_intelligence(); must contain
            'primary_domain', 'insight_opportunities', 'narrative_suggestions',
            and 'data_structure'.

    Returns:
        The complete LLM prompt string.
    """

    domain = intelligence['primary_domain']
    opportunities = intelligence['insight_opportunities']
    narrative = intelligence['narrative_suggestions']

    # Dynamic chart strategy based on data characteristics
    chart_strategy = generate_chart_strategy(intelligence)

    # NOTE(review): json.dumps below raises TypeError on raw numpy scalars —
    # enhanced_ctx must contain only JSON-safe values; confirm upstream coercion.
    prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.

**THE DATA'S STORY CONTEXT:**
{json.dumps(enhanced_ctx, indent=2)}

**INTELLIGENCE ANALYSIS:**
- Primary Domain: {domain}
- Key Opportunities: {', '.join(opportunities)}
- Data Characteristics: {intelligence['data_structure']}
- Narrative Framework: {narrative['structure']}

**YOUR STORYTELLING MISSION:**
{narrative['hook']}

**NARRATIVE CONSTRUCTION GUIDELINES:**
1. **LEAD WITH INTRIGUE**: Start with the most compelling finding that hooks the reader
2. **BUILD TENSION**: Present contrasts, surprises, or unexpected patterns
3. **REVEAL INSIGHTS**: Use data to resolve the tension with clear explanations
4. **DRIVE ACTION**: End with specific, actionable recommendations

**VISUALIZATION STRATEGY:**
{chart_strategy}

**CRITICAL INSTRUCTIONS:**
- Write as if you're revealing a detective story, not filling a template
- Every insight must be supported by data evidence
- Use compelling headers that create curiosity (not "Executive Summary")
- Weave charts naturally into the narrative flow
- Focus on business impact and actionable outcomes
- Let the data's personality shine through your writing style

**CHART INTEGRATION:**
Insert charts using: `<generate_chart: "chart_type | compelling description that advances the story">`
Available types: bar, pie, line, scatter, hist

Transform this data into a story that decision-makers can't stop reading."""

    return prompt
448
+
449
def generate_chart_strategy(intelligence: Dict) -> str:
    """Compose a one-paragraph visualization strategy from the intelligence dict.

    Starts from a domain-specific baseline sentence and appends extra guidance
    for time-series data, correlation opportunities, and segmentation
    opportunities.
    """
    domain_playbook = {
        'financial': "Focus on trend lines showing performance over time, comparative bars for different categories, and scatter plots revealing correlations between financial metrics.",
        'survey': "Emphasize distribution histograms for satisfaction scores, segmented bar charts for demographic breakdowns, and correlation matrices for response patterns.",
        'scientific': "Prioritize scatter plots with regression lines, distribution comparisons, and statistical significance visualizations.",
        'marketing': "Highlight conversion funnels, customer segment comparisons, and campaign performance trends.",
        'operational': "Show efficiency trends, capacity utilization charts, and process performance comparisons."
    }

    pieces = [domain_playbook.get(
        intelligence['primary_domain'],
        "Create visualizations that best tell your data's unique story.")]

    # Append characteristic-specific guidance (each fragment carries its own
    # leading space so concatenation stays well-formed).
    if intelligence['data_structure']['is_timeseries']:
        pieces.append(" Leverage time-series visualizations to show trends and patterns over time.")

    detected = intelligence['insight_opportunities']
    if 'correlations' in detected:
        pieces.append(" Include correlation visualizations to reveal hidden relationships.")
    if 'segmentation' in detected:
        pieces.append(" Use segmented charts to highlight different groups or categories.")

    return "".join(pieces)
477
 
478
def enhance_data_context(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
    """Build an enriched analysis context from the caller's base context.

    Adds a small statistical summary (numeric columns), a categorical summary,
    and the full autonomous intelligence analysis under 'ai_intelligence'.
    The input ctx_dict is not mutated.

    NOTE(review): generate_report_draft() also calls analyze_data_intelligence()
    directly, so the profiling runs twice per report — consider reusing the
    'ai_intelligence' entry produced here.
    """
    enhanced = ctx_dict.copy()

    if not df.empty:
        # Numeric overview: counts, missingness, and headline stats for the
        # first three numeric columns.
        num_cols = df.select_dtypes(include=[np.number]).columns
        if len(num_cols) > 0:
            cell_count = len(df) * len(df.columns)
            enhanced['statistical_summary'] = {
                'numeric_columns': len(num_cols),
                'total_records': len(df),
                'missing_data_percentage': (df.isnull().sum().sum() / cell_count) * 100,
                'key_metrics': {c: {'mean': df[c].mean(), 'std': df[c].std()}
                                for c in num_cols[:3]}  # Top 3 numeric columns
            }

        # Categorical overview: column count plus cardinality of the first three.
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(cat_cols) > 0:
            enhanced['categorical_summary'] = {
                'categorical_columns': len(cat_cols),
                'unique_values': {c: df[c].nunique() for c in cat_cols[:3]}
            }

    # Merge in the autonomous intelligence analysis (pure function of df/ctx).
    enhanced['ai_intelligence'] = analyze_data_intelligence(df, ctx_dict)

    return enhanced
511
 
512
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
    """
    Enhanced autonomous report generation with intelligent narrative creation.

    Loads the uploaded dataset, runs the autonomous intelligence analysis,
    prompts the LLM for a narrative markdown report, renders each chart the
    report requests, uploads the images to Cloud Storage, and returns the
    markdown together with a mapping of sanitized chart keys to URLs.

    Args:
        buf: File-like buffer with the uploaded dataset.
        name: Original filename (drives parser selection in load_dataframe_safely).
        ctx: Free-text user context about the data.
        uid: Owner user id (part of the storage path).
        project_id: Project id (part of the storage path; also logged).
        bucket: Cloud Storage bucket object used for chart uploads.

    Returns:
        dict with 'raw_md' (report markdown, chart tags rewritten to use
        sanitized keys) and 'chartUrls' (sanitized description -> URL).
    """
    logging.info(f"Generating autonomous report draft for project {project_id}")

    df = load_dataframe_safely(buf, name)
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)

    # Build enhanced context with AI intelligence
    ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
    enhanced_ctx = enhance_data_context(df, ctx_dict)

    # Get AI intelligence analysis
    # NOTE(review): enhance_data_context() already ran this same analysis and
    # stored it under enhanced_ctx['ai_intelligence']; this second call doubles
    # the profiling work — consider reusing that result.
    intelligence = analyze_data_intelligence(df, ctx_dict)

    # Generate autonomous prompt
    report_prompt = create_autonomous_prompt(df, enhanced_ctx, intelligence)

    # Generate the report
    md = llm.invoke(report_prompt).content

    # Extract and process charts (capped at MAX_CHARTS)
    chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
    chart_urls = {}
    chart_generator = ChartGenerator(llm, df)

    for desc in chart_descs:
        # Create a safe key for Firebase
        safe_desc = sanitize_for_firebase_key(desc)

        # Replace the original description in the markdown with the safe one,
        # so the frontend can match chart tags against the chartUrls keys.
        md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
        md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')  # Handle no quotes case

        # delete=False so the chart renderer can write to the path by name;
        # the file is removed in the finally block below.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
            img_path = Path(temp_file.name)
            try:
                chart_spec = chart_generator.generate_chart_spec(desc)  # Still generate spec from original desc
                if execute_chart_spec(chart_spec, df, img_path):
                    blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                    blob = bucket.blob(blob_name)
                    blob.upload_from_filename(str(img_path))

                    # Use the safe key in the dictionary
                    # NOTE(review): public_url assumes the bucket/object is
                    # publicly readable — confirm, or switch to a signed URL.
                    chart_urls[safe_desc] = blob.public_url
                    logging.info(f"Uploaded chart '{desc}' to {blob.public_url} with safe key '{safe_desc}'")
            finally:
                # Always remove the temp image, even if spec generation or upload fails.
                if os.path.exists(img_path):
                    os.unlink(img_path)

    return {"raw_md": md, "chartUrls": chart_urls}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
565
+ # Additional helper functions for the autonomous system
 
 
 
 
 
 
 
566
 
567
def detect_data_relationships(df: pd.DataFrame) -> Dict[str, Any]:
    """Detect relationships and patterns in the data.

    Scans the pairwise Pearson correlation matrix of the numeric columns and
    records every unordered pair whose absolute correlation exceeds 0.7.

    Args:
        df: Dataset to inspect.

    Returns:
        {'strong_correlations': [{'var1', 'var2', 'correlation'}, ...]} when
        the frame has two or more numeric columns; otherwise an empty dict
        (preserving the original contract).
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    relationships = {}

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        cols = corr_matrix.columns
        # Find strong correlations (> 0.7 or < -0.7), upper triangle only.
        strong_correlations = []
        for i in range(len(cols)):
            for j in range(i + 1, len(cols)):
                corr_val = corr_matrix.iloc[i, j]
                # FIX: explicit NaN guard (constant columns yield NaN) and a
                # float() cast so the value is a plain JSON-serializable float,
                # matching the convention used elsewhere in this module.
                if pd.notna(corr_val) and abs(corr_val) > 0.7:
                    strong_correlations.append({
                        'var1': cols[i],
                        'var2': cols[j],
                        'correlation': float(corr_val)
                    })
        relationships['strong_correlations'] = strong_correlations

    return relationships
588
 
589
def identify_key_metrics(df: pd.DataFrame, domain: str) -> List[str]:
    """Select up to five numeric columns that best represent the dataset.

    Columns whose names contain a domain-priority keyword are preferred; when
    none match, the three highest-variance numeric columns are used instead
    (variance as a proxy for "most interesting").
    """
    priority_keywords = {
        'financial': ['revenue', 'profit', 'cost', 'amount', 'price', 'margin'],
        'survey': ['rating', 'score', 'satisfaction', 'response'],
        'marketing': ['conversion', 'click', 'impression', 'engagement'],
        'operational': ['efficiency', 'utilization', 'throughput', 'performance']
    }

    keywords = priority_keywords.get(domain, [])
    candidates = df.select_dtypes(include=[np.number]).columns.tolist()

    # Keep column order; a column qualifies if any keyword appears in its name.
    chosen = [col for col in candidates
              if any(kw in col.lower() for kw in keywords)]

    # Fallback: no keyword hits — rank by descending variance and take three.
    if not chosen and candidates:
        chosen = df[candidates].var().sort_values(ascending=False).head(3).index.tolist()

    return chosen[:5]  # Return top 5 key metrics
617
 
618
 
619
  # Removed - no longer needed since we're letting AI decide everything organically