rairo committed on
Commit
6bf9436
·
verified ·
1 Parent(s): dbb1090

Update sozo_gen.py

Browse files
Files changed (1) hide show
  1. sozo_gen.py +78 -248
sozo_gen.py CHANGED
@@ -1,4 +1,5 @@
1
  # sozo_gen.py
 
2
 
3
  import os
4
  import re
@@ -77,7 +78,7 @@ def clean_narration(txt: str) -> str:
77
  def placeholder_img() -> Image.Image: return Image.new("RGB", (WIDTH, HEIGHT), (230, 230, 230))
78
 
79
  def generate_image_from_prompt(prompt: str) -> Image.Image:
80
- model_main = "gemini-1.5-flash-latest";
81
  full_prompt = "A clean business-presentation illustration: " + prompt
82
  try:
83
  model = genai.GenerativeModel(model_main)
@@ -309,13 +310,35 @@ def analyze_data_intelligence(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any
309
  # Statistical Profile
310
  statistical_summary = {}
311
  if numeric_cols:
312
- statistical_summary = {
313
- 'correlations': df[numeric_cols].corr().abs().max().to_dict(),
314
- 'distributions': {col: 'normal' if stats.normaltest(df[col].dropna())[1] > 0.05 else 'non_normal'
315
- for col in numeric_cols if len(df[col].dropna()) > 8},
316
- 'outliers': {col: len(df[col][np.abs(stats.zscore(df[col].dropna())) > 3])
317
- for col in numeric_cols if len(df[col].dropna()) > 0}
318
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  # Pattern Detection
321
  patterns = {
@@ -394,6 +417,23 @@ def get_narrative_suggestions(domain: str, opportunities: List[str], patterns: D
394
 
395
  return narrative_frameworks.get(domain, narrative_frameworks['general'])
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
398
  """
399
  Generate a dynamic, intelligence-driven prompt that creates compelling narratives
@@ -407,15 +447,18 @@ def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence:
407
  # Dynamic chart strategy based on data characteristics
408
  chart_strategy = generate_chart_strategy(intelligence)
409
 
 
 
 
410
  prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.
411
 
412
  **THE DATA'S STORY CONTEXT:**
413
- {json.dumps(enhanced_ctx, indent=2)}
414
 
415
  **INTELLIGENCE ANALYSIS:**
416
  - Primary Domain: {domain}
417
  - Key Opportunities: {', '.join(opportunities)}
418
- - Data Characteristics: {intelligence['data_structure']}
419
  - Narrative Framework: {narrative['structure']}
420
 
421
  **YOUR STORYTELLING MISSION:**
@@ -488,20 +531,38 @@ def enhance_data_context(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
488
  if not df.empty:
489
  numeric_cols = df.select_dtypes(include=[np.number]).columns
490
  if len(numeric_cols) > 0:
 
 
 
 
 
 
 
 
 
 
 
 
491
  enhanced['statistical_summary'] = {
492
- 'numeric_columns': len(numeric_cols),
493
- 'total_records': len(df),
494
- 'missing_data_percentage': (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,
495
- 'key_metrics': {col: {'mean': df[col].mean(), 'std': df[col].std()}
496
- for col in numeric_cols[:3]} # Top 3 numeric columns
497
  }
498
 
499
  # Add categorical context
500
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
501
  if len(categorical_cols) > 0:
 
 
 
 
 
 
 
502
  enhanced['categorical_summary'] = {
503
- 'categorical_columns': len(categorical_cols),
504
- 'unique_values': {col: df[col].nunique() for col in categorical_cols[:3]}
505
  }
506
 
507
  # Merge with intelligence analysis
@@ -614,237 +675,6 @@ def identify_key_metrics(df: pd.DataFrame, domain: str) -> List[str]:
614
  key_metrics = variances.head(3).index.tolist()
615
 
616
  return key_metrics[:5] # Return top 5 key metrics
617
-
618
-
619
- # Removed - no longer needed since we're letting AI decide everything organically
620
-
621
-
622
- def generate_autonomous_charts(llm, df: pd.DataFrame, report_md: str, uid: str, project_id: str, bucket) -> Dict[str, str]:
623
- """
624
- Generates charts autonomously based on the report content and data characteristics.
625
- """
626
- # Extract chart descriptions from the enhanced report
627
- chart_descs = extract_chart_tags(report_md)[:MAX_CHARTS]
628
- chart_urls = {}
629
-
630
- if not chart_descs:
631
- # If no charts specified, generate intelligent defaults
632
- chart_descs = generate_intelligent_chart_suggestions(df, llm)
633
-
634
- chart_generator = ChartGenerator(llm, df)
635
-
636
- for desc in chart_descs:
637
- try:
638
- # Create a safe key for Firebase
639
- safe_desc = sanitize_for_firebase_key(desc)
640
-
641
- # Replace chart tags in markdown
642
- report_md = report_md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
643
- report_md = report_md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')
644
-
645
- # Generate chart
646
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
647
- img_path = Path(temp_file.name)
648
- try:
649
- chart_spec = chart_generator.generate_chart_spec(desc)
650
- if execute_chart_spec(chart_spec, df, img_path):
651
- blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
652
- blob = bucket.blob(blob_name)
653
- blob.upload_from_filename(str(img_path))
654
-
655
- chart_urls[safe_desc] = blob.public_url
656
- logging.info(f"Generated autonomous chart: {safe_desc}")
657
- finally:
658
- if os.path.exists(img_path):
659
- os.unlink(img_path)
660
-
661
- except Exception as e:
662
- logging.error(f"Failed to generate chart '{desc}': {str(e)}")
663
- continue
664
-
665
- return chart_urls
666
-
667
-
668
- def generate_intelligent_chart_suggestions(df: pd.DataFrame, llm) -> List[str]:
669
- """
670
- Generates intelligent chart suggestions based on data characteristics.
671
- """
672
- numeric_cols = df.select_dtypes(include=[np.number]).columns
673
- categorical_cols = df.select_dtypes(include=['object']).columns
674
-
675
- suggestions = []
676
-
677
- # Time series chart if temporal data exists
678
- if detect_time_series(df):
679
- suggestions.append("line | Time series trend analysis | Show temporal patterns")
680
-
681
- # Distribution chart for numeric data
682
- if len(numeric_cols) > 0:
683
- main_numeric = numeric_cols[0]
684
- suggestions.append(f"hist | Distribution of {main_numeric} | Understand data distribution")
685
-
686
- # Correlation analysis if multiple numeric columns
687
- if len(numeric_cols) > 1:
688
- suggestions.append("scatter | Correlation analysis | Identify relationships between variables")
689
-
690
- # Categorical breakdown
691
- if len(categorical_cols) > 0:
692
- main_categorical = categorical_cols[0]
693
- suggestions.append(f"bar | {main_categorical} breakdown | Show categorical distribution")
694
-
695
- return suggestions[:MAX_CHARTS]
696
-
697
-
698
- # Helper functions (preserve existing functionality)
699
- def detect_time_series(df: pd.DataFrame) -> bool:
700
- """Detect if dataset contains time series data."""
701
- for col in df.columns:
702
- if 'date' in col.lower() or 'time' in col.lower():
703
- return True
704
- try:
705
- pd.to_datetime(df[col])
706
- return True
707
- except:
708
- continue
709
- return False
710
-
711
-
712
- def detect_transactional_data(df: pd.DataFrame) -> bool:
713
- """Detect if dataset contains transactional data."""
714
- transaction_indicators = ['transaction', 'payment', 'order', 'invoice', 'amount', 'quantity']
715
- columns_lower = [col.lower() for col in df.columns]
716
- return any(indicator in col for col in columns_lower for indicator in transaction_indicators)
717
-
718
-
719
- def detect_experimental_data(df: pd.DataFrame) -> bool:
720
- """Detect if dataset contains experimental data."""
721
- experimental_indicators = ['test', 'experiment', 'trial', 'group', 'treatment', 'control']
722
- columns_lower = [col.lower() for col in df.columns]
723
- return any(indicator in col for col in columns_lower for indicator in experimental_indicators)
724
-
725
-
726
- def detect_temporal_frequency(date_series: pd.Series) -> str:
727
- """Detect the frequency of temporal data."""
728
- if len(date_series) < 2:
729
- return "insufficient_data"
730
-
731
- # Calculate time differences
732
- time_diffs = date_series.sort_values().diff().dropna()
733
- median_diff = time_diffs.median()
734
-
735
- if median_diff <= pd.Timedelta(days=1):
736
- return "daily"
737
- elif median_diff <= pd.Timedelta(days=7):
738
- return "weekly"
739
- elif median_diff <= pd.Timedelta(days=31):
740
- return "monthly"
741
- else:
742
- return "irregular"
743
-
744
-
745
- def determine_analysis_complexity(df: pd.DataFrame, domain_analysis: Dict[str, Any]) -> str:
746
- """Determine the complexity level of analysis required."""
747
- complexity_factors = 0
748
-
749
- # Data size factor
750
- if len(df) > 10000:
751
- complexity_factors += 1
752
- if len(df.columns) > 20:
753
- complexity_factors += 1
754
-
755
- # Data type diversity
756
- if len(df.select_dtypes(include=[np.number]).columns) > 5:
757
- complexity_factors += 1
758
- if len(df.select_dtypes(include=['object']).columns) > 5:
759
- complexity_factors += 1
760
-
761
- # Domain complexity
762
- if domain_analysis["primary_domain"] in ["scientific", "financial"]:
763
- complexity_factors += 1
764
-
765
- if complexity_factors >= 3:
766
- return "high"
767
- elif complexity_factors >= 2:
768
- return "medium"
769
- else:
770
- return "low"
771
-
772
-
773
- def generate_original_report(df: pd.DataFrame, llm, ctx: str, uid: str, project_id: str, bucket) -> Dict[str, str]:
774
- """
775
- Fallback to original report generation logic if enhanced version fails.
776
- """
777
- logging.info("Using fallback report generation")
778
-
779
- # Original logic preserved
780
- ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
781
- enhanced_ctx = enhance_data_context(df, ctx_dict)
782
-
783
- report_prompt = f"""
784
- You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
785
- **Dataset Analysis Context:** {json.dumps(enhanced_ctx, indent=2)}
786
- **Instructions:**
787
- 1. **Executive Summary**: Start with a high-level summary of key findings.
788
- 2. **Key Insights**: Provide 3-5 key insights, each with its own chart tag.
789
- 3. **Visual Support**: Insert chart tags like: `<generate_chart: "chart_type | specific description">`.
790
- Valid chart types: bar, pie, line, scatter, hist.
791
- Generate insights that would be valuable to C-level executives.
792
- """
793
-
794
- md = llm.invoke(report_prompt).content
795
- chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
796
- chart_urls = {}
797
- chart_generator = ChartGenerator(llm, df)
798
-
799
- for desc in chart_descs:
800
- safe_desc = sanitize_for_firebase_key(desc)
801
- md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
802
- md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')
803
-
804
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
805
- img_path = Path(temp_file.name)
806
- try:
807
- chart_spec = chart_generator.generate_chart_spec(desc)
808
- if execute_chart_spec(chart_spec, df, img_path):
809
- blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
810
- blob = bucket.blob(blob_name)
811
- blob.upload_from_filename(str(img_path))
812
- chart_urls[safe_desc] = blob.public_url
813
- finally:
814
- if os.path.exists(img_path):
815
- os.unlink(img_path)
816
-
817
- return {"raw_md": md, "chartUrls": chart_urls}
818
-
819
-
820
- def generate_fallback_report(autonomous_context: Dict[str, Any]) -> str:
821
- """
822
- Generates a basic fallback report when enhanced generation fails.
823
- """
824
- basic_info = autonomous_context["basic_info"]
825
- domain = autonomous_context["domain"]["primary_domain"]
826
-
827
- return f"""
828
- # What This Data Reveals
829
-
830
- Looking at this {domain} dataset with {basic_info['shape'][0]} records, there are several key insights worth highlighting.
831
-
832
- ## The Numbers Tell a Story
833
-
834
- This dataset contains {basic_info['shape'][1]} different variables, suggesting a comprehensive view of the underlying processes or behaviors being measured.
835
-
836
- <generate_chart: "bar | Data overview showing key metrics">
837
-
838
- ## What You Should Know
839
-
840
- The data structure and patterns suggest this is worth deeper investigation. The variety of data types and relationships indicate multiple analytical opportunities.
841
-
842
- ## Next Steps
843
-
844
- Based on this initial analysis, I recommend diving deeper into the specific patterns and relationships within the data to unlock more actionable insights.
845
-
846
- *Note: This is a simplified analysis. Enhanced storytelling temporarily unavailable.*
847
- """
848
  # Removed - no longer needed since we're letting AI decide everything organically
849
 
850
 
 
1
  # sozo_gen.py
2
+ # sozo_gen.py
3
 
4
  import os
5
  import re
 
78
  def placeholder_img() -> Image.Image: return Image.new("RGB", (WIDTH, HEIGHT), (230, 230, 230))
79
 
80
  def generate_image_from_prompt(prompt: str) -> Image.Image:
81
+ model_main = "gemini-2.0-flash-exp";
82
  full_prompt = "A clean business-presentation illustration: " + prompt
83
  try:
84
  model = genai.GenerativeModel(model_main)
 
310
  # Statistical Profile
311
  statistical_summary = {}
312
  if numeric_cols:
313
+ try:
314
+ correlations = df[numeric_cols].corr().abs().max()
315
+ correlations_dict = {k: float(v) if pd.notna(v) else 0.0 for k, v in correlations.to_dict().items()}
316
+
317
+ distributions = {}
318
+ for col in numeric_cols:
319
+ if len(df[col].dropna()) > 8:
320
+ try:
321
+ p_value = stats.normaltest(df[col].dropna())[1]
322
+ distributions[col] = 'normal' if p_value > 0.05 else 'non_normal'
323
+ except:
324
+ distributions[col] = 'unknown'
325
+
326
+ outliers = {}
327
+ for col in numeric_cols:
328
+ if len(df[col].dropna()) > 0:
329
+ try:
330
+ z_scores = np.abs(stats.zscore(df[col].dropna()))
331
+ outliers[col] = int(len(df[col][z_scores > 3]))
332
+ except:
333
+ outliers[col] = 0
334
+
335
+ statistical_summary = {
336
+ 'correlations': correlations_dict,
337
+ 'distributions': distributions,
338
+ 'outliers': outliers
339
+ }
340
+ except Exception as e:
341
+ statistical_summary = {'error': 'Could not compute statistical summary'}
342
 
343
  # Pattern Detection
344
  patterns = {
 
417
 
418
  return narrative_frameworks.get(domain, narrative_frameworks['general'])
419
 
420
+ def json_serializable(obj):
421
+ """Convert objects to JSON-serializable format"""
422
+ if isinstance(obj, (np.integer, np.floating)):
423
+ return float(obj)
424
+ elif isinstance(obj, np.ndarray):
425
+ return obj.tolist()
426
+ elif isinstance(obj, (np.bool_, bool)):
427
+ return bool(obj)
428
+ elif isinstance(obj, dict):
429
+ return {k: json_serializable(v) for k, v in obj.items()}
430
+ elif isinstance(obj, (list, tuple)):
431
+ return [json_serializable(item) for item in obj]
432
+ elif pd.isna(obj):
433
+ return None
434
+ else:
435
+ return obj
436
+
437
  def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
438
  """
439
  Generate a dynamic, intelligence-driven prompt that creates compelling narratives
 
447
  # Dynamic chart strategy based on data characteristics
448
  chart_strategy = generate_chart_strategy(intelligence)
449
 
450
+ # Make context JSON serializable
451
+ serializable_ctx = json_serializable(enhanced_ctx)
452
+
453
  prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.
454
 
455
  **THE DATA'S STORY CONTEXT:**
456
+ {json.dumps(serializable_ctx, indent=2)}
457
 
458
  **INTELLIGENCE ANALYSIS:**
459
  - Primary Domain: {domain}
460
  - Key Opportunities: {', '.join(opportunities)}
461
+ - Data Characteristics: {json_serializable(intelligence['data_structure'])}
462
  - Narrative Framework: {narrative['structure']}
463
 
464
  **YOUR STORYTELLING MISSION:**
 
531
  if not df.empty:
532
  numeric_cols = df.select_dtypes(include=[np.number]).columns
533
  if len(numeric_cols) > 0:
534
+ key_metrics = {}
535
+ for col in numeric_cols[:3]: # Top 3 numeric columns
536
+ try:
537
+ mean_val = df[col].mean()
538
+ std_val = df[col].std()
539
+ key_metrics[col] = {
540
+ 'mean': float(mean_val) if pd.notna(mean_val) else 0.0,
541
+ 'std': float(std_val) if pd.notna(std_val) else 0.0
542
+ }
543
+ except:
544
+ key_metrics[col] = {'mean': 0.0, 'std': 0.0}
545
+
546
  enhanced['statistical_summary'] = {
547
+ 'numeric_columns': int(len(numeric_cols)),
548
+ 'total_records': int(len(df)),
549
+ 'missing_data_percentage': float((df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100),
550
+ 'key_metrics': key_metrics
 
551
  }
552
 
553
  # Add categorical context
554
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
555
  if len(categorical_cols) > 0:
556
+ unique_values = {}
557
+ for col in categorical_cols[:3]:
558
+ try:
559
+ unique_values[col] = int(df[col].nunique())
560
+ except:
561
+ unique_values[col] = 0
562
+
563
  enhanced['categorical_summary'] = {
564
+ 'categorical_columns': int(len(categorical_cols)),
565
+ 'unique_values': unique_values
566
  }
567
 
568
  # Merge with intelligence analysis
 
675
  key_metrics = variances.head(3).index.tolist()
676
 
677
  return key_metrics[:5] # Return top 5 key metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
678
  # Removed - no longer needed since we're letting AI decide everything organically
679
 
680