Update sozo_gen.py
Browse files- sozo_gen.py +78 -248
sozo_gen.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# sozo_gen.py
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import re
|
|
@@ -77,7 +78,7 @@ def clean_narration(txt: str) -> str:
|
|
| 77 |
def placeholder_img() -> Image.Image:
    """Build a flat light-grey RGB image sized WIDTH x HEIGHT."""
    canvas_size = (WIDTH, HEIGHT)
    light_grey = (230, 230, 230)
    return Image.new("RGB", canvas_size, light_grey)
|
| 78 |
|
| 79 |
def generate_image_from_prompt(prompt: str) -> Image.Image:
|
| 80 |
-
model_main = "gemini-
|
| 81 |
full_prompt = "A clean business-presentation illustration: " + prompt
|
| 82 |
try:
|
| 83 |
model = genai.GenerativeModel(model_main)
|
|
@@ -309,13 +310,35 @@ def analyze_data_intelligence(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any
|
|
| 309 |
# Statistical Profile
|
| 310 |
statistical_summary = {}
|
| 311 |
if numeric_cols:
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
# Pattern Detection
|
| 321 |
patterns = {
|
|
@@ -394,6 +417,23 @@ def get_narrative_suggestions(domain: str, opportunities: List[str], patterns: D
|
|
| 394 |
|
| 395 |
return narrative_frameworks.get(domain, narrative_frameworks['general'])
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
|
| 398 |
"""
|
| 399 |
Generate a dynamic, intelligence-driven prompt that creates compelling narratives
|
|
@@ -407,15 +447,18 @@ def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence:
|
|
| 407 |
# Dynamic chart strategy based on data characteristics
|
| 408 |
chart_strategy = generate_chart_strategy(intelligence)
|
| 409 |
|
|
|
|
|
|
|
|
|
|
| 410 |
prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.
|
| 411 |
|
| 412 |
**THE DATA'S STORY CONTEXT:**
|
| 413 |
-
{json.dumps(
|
| 414 |
|
| 415 |
**INTELLIGENCE ANALYSIS:**
|
| 416 |
- Primary Domain: {domain}
|
| 417 |
- Key Opportunities: {', '.join(opportunities)}
|
| 418 |
-
- Data Characteristics: {intelligence['data_structure']}
|
| 419 |
- Narrative Framework: {narrative['structure']}
|
| 420 |
|
| 421 |
**YOUR STORYTELLING MISSION:**
|
|
@@ -488,20 +531,38 @@ def enhance_data_context(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
|
|
| 488 |
if not df.empty:
|
| 489 |
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
| 490 |
if len(numeric_cols) > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
enhanced['statistical_summary'] = {
|
| 492 |
-
'numeric_columns': len(numeric_cols),
|
| 493 |
-
'total_records': len(df),
|
| 494 |
-
'missing_data_percentage': (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,
|
| 495 |
-
'key_metrics':
|
| 496 |
-
for col in numeric_cols[:3]} # Top 3 numeric columns
|
| 497 |
}
|
| 498 |
|
| 499 |
# Add categorical context
|
| 500 |
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
|
| 501 |
if len(categorical_cols) > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
enhanced['categorical_summary'] = {
|
| 503 |
-
'categorical_columns': len(categorical_cols),
|
| 504 |
-
'unique_values':
|
| 505 |
}
|
| 506 |
|
| 507 |
# Merge with intelligence analysis
|
|
@@ -614,237 +675,6 @@ def identify_key_metrics(df: pd.DataFrame, domain: str) -> List[str]:
|
|
| 614 |
key_metrics = variances.head(3).index.tolist()
|
| 615 |
|
| 616 |
return key_metrics[:5] # Return top 5 key metrics
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
# Removed - no longer needed since we're letting AI decide everything organically
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
def generate_autonomous_charts(llm, df: pd.DataFrame, report_md: str, uid: str, project_id: str, bucket) -> Dict[str, str]:
    """
    Render the charts a report asks for and upload each one to the bucket.

    Chart descriptions are taken from the chart tags in ``report_md`` (at most
    MAX_CHARTS). When the report has no tags, the descriptions come from
    ``generate_intelligent_chart_suggestions`` instead. A failure on one chart
    is logged and the loop moves on to the next description.

    Returns a dict mapping each sanitized description to the public URL of the
    uploaded PNG.
    """
    requested = extract_chart_tags(report_md)[:MAX_CHARTS]
    uploaded = {}

    if not requested:
        # Report carried no chart tags: fall back to data-driven suggestions.
        requested = generate_intelligent_chart_suggestions(df, llm)

    spec_builder = ChartGenerator(llm, df)

    for desc in requested:
        try:
            # Key that is safe to store in Firebase.
            chart_key = sanitize_for_firebase_key(desc)
            normalized_tag = f'<generate_chart: "{chart_key}">'

            # NOTE(review): these rewrites only rebind the local report_md;
            # the updated markdown is not returned to the caller.
            for raw_tag in (f'<generate_chart: "{desc}">', f'<generate_chart: {desc}>'):
                report_md = report_md.replace(raw_tag, normalized_tag)

            # Reserve a temp file name for the rendered PNG.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
                img_path = Path(temp_file.name)

            try:
                spec = spec_builder.generate_chart_spec(desc)
                rendered = execute_chart_spec(spec, df, img_path)
                if rendered:
                    target = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                    blob = bucket.blob(target)
                    blob.upload_from_filename(str(img_path))

                    uploaded[chart_key] = blob.public_url
                    logging.info(f"Generated autonomous chart: {chart_key}")
            finally:
                # Always remove the temp file, whether or not rendering worked.
                if os.path.exists(img_path):
                    os.unlink(img_path)

        except Exception as e:
            logging.error(f"Failed to generate chart '{desc}': {str(e)}")
            continue

    return uploaded
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
def generate_intelligent_chart_suggestions(df: pd.DataFrame, llm) -> List[str]:
    """
    Propose default chart descriptions from the DataFrame's column types.

    Each suggestion is a "type | title | purpose" string. ``llm`` is accepted
    but not used by this function. At most MAX_CHARTS suggestions are returned.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    text_names = df.select_dtypes(include=['object']).columns
    numeric_count = len(numeric_names)

    ideas = []

    # Temporal data gets a trend line.
    if detect_time_series(df):
        ideas.append("line | Time series trend analysis | Show temporal patterns")

    # First numeric column gets a histogram.
    if numeric_count > 0:
        ideas.append(f"hist | Distribution of {numeric_names[0]} | Understand data distribution")

    # Two or more numeric columns allow a correlation scatter.
    if numeric_count > 1:
        ideas.append("scatter | Correlation analysis | Identify relationships between variables")

    # First text column gets a bar breakdown.
    if len(text_names) > 0:
        ideas.append(f"bar | {text_names[0]} breakdown | Show categorical distribution")

    return ideas[:MAX_CHARTS]
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
# Helper functions (preserve existing functionality)
|
| 699 |
-
def detect_time_series(df: pd.DataFrame) -> bool:
    """
    Detect if dataset contains time series data.

    A column counts as temporal when its label contains "date" or "time"
    (case-insensitive), when it already has a datetime dtype, or when its
    non-null, non-numeric values can be parsed by ``pd.to_datetime``.

    Returns True as soon as one such column is found, otherwise False.
    """
    for col in df.columns:
        # Labels may be non-strings (e.g. integer headers), so stringify first.
        label = str(col).lower()
        if 'date' in label or 'time' in label:
            return True

        values = df[col]
        if pd.api.types.is_datetime64_any_dtype(values):
            return True

        # pd.to_datetime happily reads numbers as epoch offsets, which would
        # flag every numeric column as temporal; skip them.
        if pd.api.types.is_numeric_dtype(values):
            continue

        present = values.dropna()
        if present.empty:
            # Nothing to parse, so there is no evidence of dates here.
            continue

        try:
            pd.to_datetime(present)
        except (ValueError, TypeError, OverflowError):
            continue
        return True

    return False
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
def detect_transactional_data(df: pd.DataFrame) -> bool:
    """
    Detect if dataset contains transactional data.

    Returns True when any column label contains one of the transaction
    keywords as a case-insensitive substring, otherwise False.
    """
    transaction_indicators = ('transaction', 'payment', 'order', 'invoice', 'amount', 'quantity')
    # str() keeps non-string labels (e.g. integer headers) from raising.
    columns_lower = [str(col).lower() for col in df.columns]
    return any(
        indicator in col
        for col in columns_lower
        for indicator in transaction_indicators
    )
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
def detect_experimental_data(df: pd.DataFrame) -> bool:
    """
    Detect if dataset contains experimental data.

    Returns True when any column label contains one of the experiment
    keywords as a case-insensitive substring, otherwise False. Matching is by
    substring, so a label such as "latest" matches "test".
    """
    experimental_indicators = ('test', 'experiment', 'trial', 'group', 'treatment', 'control')
    # str() keeps non-string labels (e.g. integer headers) from raising.
    columns_lower = [str(col).lower() for col in df.columns]
    return any(
        indicator in col
        for col in columns_lower
        for indicator in experimental_indicators
    )
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
def detect_temporal_frequency(date_series: pd.Series) -> str:
    """
    Detect the frequency of temporal data.

    Classifies the median gap between consecutive sorted timestamps:
    up to 1 day is "daily", up to 7 days "weekly", up to 31 days "monthly",
    anything larger "irregular". Returns "insufficient_data" when fewer than
    two non-missing timestamps are available.
    """
    # Missing timestamps carry no spacing information; drop them before
    # counting so a series of mostly NaT is not mistaken for real data.
    valid = date_series.dropna()
    if len(valid) < 2:
        return "insufficient_data"

    # Calculate time differences
    time_diffs = valid.sort_values().diff().dropna()
    median_diff = time_diffs.median()

    # A NaT median compares False against every threshold; report it
    # explicitly instead of falling through to "irregular".
    if pd.isna(median_diff):
        return "insufficient_data"

    if median_diff <= pd.Timedelta(days=1):
        return "daily"
    if median_diff <= pd.Timedelta(days=7):
        return "weekly"
    if median_diff <= pd.Timedelta(days=31):
        return "monthly"
    return "irregular"
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
def determine_analysis_complexity(df: pd.DataFrame, domain_analysis: Dict[str, Any]) -> str:
    """
    Determine the complexity level of analysis required.

    One point is scored for each of: more than 10000 rows, more than 20
    columns, more than 5 numeric columns, more than 5 object columns, and a
    primary domain of "scientific" or "financial". Three or more points give
    "high", two give "medium", anything less gives "low".

    A missing or empty ``domain_analysis`` simply scores no domain point.
    """
    numeric_count = len(df.select_dtypes(include=[np.number]).columns)
    object_count = len(df.select_dtypes(include=['object']).columns)
    # .get avoids a KeyError when the domain analysis is partial or absent.
    primary_domain = (domain_analysis or {}).get("primary_domain")

    complexity_factors = sum((
        len(df) > 10000,            # data size: rows
        len(df.columns) > 20,       # data size: columns
        numeric_count > 5,          # data type diversity
        object_count > 5,
        primary_domain in ("scientific", "financial"),  # domain complexity
    ))

    if complexity_factors >= 3:
        return "high"
    if complexity_factors >= 2:
        return "medium"
    return "low"
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
def generate_original_report(df: pd.DataFrame, llm, ctx: str, uid: str, project_id: str, bucket) -> Dict[str, str]:
    """
    Fallback to original report generation logic if enhanced version fails.

    Builds a context dict from the DataFrame, asks ``llm`` for a Markdown
    report, then renders and uploads up to MAX_CHARTS charts named by the
    report's chart tags. Chart tags in the Markdown are rewritten to use the
    sanitized description as their key.

    Returns a dict with "raw_md" (the report Markdown) and "chartUrls"
    (sanitized description -> public URL of the uploaded PNG).
    """
    logging.info("Using fallback report generation")

    # Original logic preserved
    ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
    enhanced_ctx = enhance_data_context(df, ctx_dict)

    report_prompt = f"""
    You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
    **Dataset Analysis Context:** {json.dumps(enhanced_ctx, indent=2)}
    **Instructions:**
    1. **Executive Summary**: Start with a high-level summary of key findings.
    2. **Key Insights**: Provide 3-5 key insights, each with its own chart tag.
    3. **Visual Support**: Insert chart tags like: `<generate_chart: "chart_type | specific description">`.
    Valid chart types: bar, pie, line, scatter, hist.
    Generate insights that would be valuable to C-level executives.
    """

    md = llm.invoke(report_prompt).content
    chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
    chart_urls = {}
    chart_generator = ChartGenerator(llm, df)

    for desc in chart_descs:
        safe_desc = sanitize_for_firebase_key(desc)
        md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
        md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')

        img_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
                img_path = Path(temp_file.name)

            chart_spec = chart_generator.generate_chart_spec(desc)
            if execute_chart_spec(chart_spec, df, img_path):
                blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                blob = bucket.blob(blob_name)
                blob.upload_from_filename(str(img_path))
                chart_urls[safe_desc] = blob.public_url
        except Exception:
            # This is the fallback path: one broken chart must not discard
            # the report text and the charts that did render.
            logging.exception("Failed to generate chart '%s'", desc)
        finally:
            # Always remove the temp file, whether or not rendering worked.
            if img_path is not None and os.path.exists(img_path):
                os.unlink(img_path)

    return {"raw_md": md, "chartUrls": chart_urls}
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
def generate_fallback_report(autonomous_context: Dict[str, Any]) -> str:
    """
    Generates a basic fallback report when enhanced generation fails.

    Reads the record and variable counts from
    ``autonomous_context["basic_info"]["shape"]`` and the domain name from
    ``autonomous_context["domain"]["primary_domain"]``, and fills them into a
    fixed Markdown template containing a single chart tag.

    Missing keys do not raise: the domain defaults to "general" and an absent
    count is rendered as "an unknown number of", so this last-resort path
    always yields a report.
    """
    basic_info = autonomous_context.get("basic_info") or {}
    domain = (autonomous_context.get("domain") or {}).get("primary_domain", "general")

    shape = basic_info.get("shape") or ()
    unknown = "an unknown number of"
    record_count = shape[0] if len(shape) > 0 else unknown
    variable_count = shape[1] if len(shape) > 1 else unknown

    return f"""
# What This Data Reveals

Looking at this {domain} dataset with {record_count} records, there are several key insights worth highlighting.

## The Numbers Tell a Story

This dataset contains {variable_count} different variables, suggesting a comprehensive view of the underlying processes or behaviors being measured.

<generate_chart: "bar | Data overview showing key metrics">

## What You Should Know

The data structure and patterns suggest this is worth deeper investigation. The variety of data types and relationships indicate multiple analytical opportunities.

## Next Steps

Based on this initial analysis, I recommend diving deeper into the specific patterns and relationships within the data to unlock more actionable insights.

*Note: This is a simplified analysis. Enhanced storytelling temporarily unavailable.*
"""
|
| 848 |
# Removed - no longer needed since we're letting AI decide everything organically
|
| 849 |
|
| 850 |
|
|
|
|
| 1 |
# sozo_gen.py
|
| 2 |
+
# sozo_gen.py
|
| 3 |
|
| 4 |
import os
|
| 5 |
import re
|
|
|
|
| 78 |
def placeholder_img() -> Image.Image:
    """Build a flat light-grey RGB image sized WIDTH x HEIGHT."""
    canvas_size = (WIDTH, HEIGHT)
    light_grey = (230, 230, 230)
    return Image.new("RGB", canvas_size, light_grey)
|
| 79 |
|
| 80 |
def generate_image_from_prompt(prompt: str) -> Image.Image:
|
| 81 |
+
model_main = "gemini-2.0-flash-exp";
|
| 82 |
full_prompt = "A clean business-presentation illustration: " + prompt
|
| 83 |
try:
|
| 84 |
model = genai.GenerativeModel(model_main)
|
|
|
|
| 310 |
# Statistical Profile
|
| 311 |
statistical_summary = {}
|
| 312 |
if numeric_cols:
|
| 313 |
+
try:
|
| 314 |
+
correlations = df[numeric_cols].corr().abs().max()
|
| 315 |
+
correlations_dict = {k: float(v) if pd.notna(v) else 0.0 for k, v in correlations.to_dict().items()}
|
| 316 |
+
|
| 317 |
+
distributions = {}
|
| 318 |
+
for col in numeric_cols:
|
| 319 |
+
if len(df[col].dropna()) > 8:
|
| 320 |
+
try:
|
| 321 |
+
p_value = stats.normaltest(df[col].dropna())[1]
|
| 322 |
+
distributions[col] = 'normal' if p_value > 0.05 else 'non_normal'
|
| 323 |
+
except:
|
| 324 |
+
distributions[col] = 'unknown'
|
| 325 |
+
|
| 326 |
+
outliers = {}
|
| 327 |
+
for col in numeric_cols:
|
| 328 |
+
if len(df[col].dropna()) > 0:
|
| 329 |
+
try:
|
| 330 |
+
z_scores = np.abs(stats.zscore(df[col].dropna()))
|
| 331 |
+
outliers[col] = int(len(df[col][z_scores > 3]))
|
| 332 |
+
except:
|
| 333 |
+
outliers[col] = 0
|
| 334 |
+
|
| 335 |
+
statistical_summary = {
|
| 336 |
+
'correlations': correlations_dict,
|
| 337 |
+
'distributions': distributions,
|
| 338 |
+
'outliers': outliers
|
| 339 |
+
}
|
| 340 |
+
except Exception as e:
|
| 341 |
+
statistical_summary = {'error': 'Could not compute statistical summary'}
|
| 342 |
|
| 343 |
# Pattern Detection
|
| 344 |
patterns = {
|
|
|
|
| 417 |
|
| 418 |
return narrative_frameworks.get(domain, narrative_frameworks['general'])
|
| 419 |
|
| 420 |
+
def json_serializable(obj):
    """
    Convert objects to JSON-serializable format.

    Recursively walks dicts, lists, tuples, NumPy arrays and pandas
    Series/Index objects and converts the values ``json.dumps`` cannot handle:

    - NumPy booleans, integers and floats become Python bool, int and float
    - NaN, None, NaT and other pandas missing markers become None
    - datetime/date values (including pandas Timestamps) become ISO strings
    - NumPy scalar dict keys become plain Python scalars

    Anything else is returned unchanged.
    """
    import datetime  # local import: only needed for the timestamp branch

    # Booleans first so they are never treated as numbers.
    if isinstance(obj, (bool, np.bool_)):
        return bool(obj)
    if isinstance(obj, np.integer):
        # Keep integers integral so counts render as 120 rather than 120.0.
        return int(obj)
    if isinstance(obj, (float, np.floating)):
        value = float(obj)
        # NaN has no JSON representation; map it to null like other missing values.
        return None if np.isnan(value) else value
    if isinstance(obj, (np.ndarray, pd.Series, pd.Index)):
        # Recurse so NaN and timestamps inside the container are cleaned too.
        return json_serializable(obj.tolist())
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): json_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [json_serializable(item) for item in obj]
    # pd.isna returns an array for array-likes, so only test true scalars.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    return obj
|
| 436 |
+
|
| 437 |
def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
|
| 438 |
"""
|
| 439 |
Generate a dynamic, intelligence-driven prompt that creates compelling narratives
|
|
|
|
| 447 |
# Dynamic chart strategy based on data characteristics
|
| 448 |
chart_strategy = generate_chart_strategy(intelligence)
|
| 449 |
|
| 450 |
+
# Make context JSON serializable
|
| 451 |
+
serializable_ctx = json_serializable(enhanced_ctx)
|
| 452 |
+
|
| 453 |
prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.
|
| 454 |
|
| 455 |
**THE DATA'S STORY CONTEXT:**
|
| 456 |
+
{json.dumps(serializable_ctx, indent=2)}
|
| 457 |
|
| 458 |
**INTELLIGENCE ANALYSIS:**
|
| 459 |
- Primary Domain: {domain}
|
| 460 |
- Key Opportunities: {', '.join(opportunities)}
|
| 461 |
+
- Data Characteristics: {json_serializable(intelligence['data_structure'])}
|
| 462 |
- Narrative Framework: {narrative['structure']}
|
| 463 |
|
| 464 |
**YOUR STORYTELLING MISSION:**
|
|
|
|
| 531 |
if not df.empty:
|
| 532 |
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
| 533 |
if len(numeric_cols) > 0:
|
| 534 |
+
key_metrics = {}
|
| 535 |
+
for col in numeric_cols[:3]: # Top 3 numeric columns
|
| 536 |
+
try:
|
| 537 |
+
mean_val = df[col].mean()
|
| 538 |
+
std_val = df[col].std()
|
| 539 |
+
key_metrics[col] = {
|
| 540 |
+
'mean': float(mean_val) if pd.notna(mean_val) else 0.0,
|
| 541 |
+
'std': float(std_val) if pd.notna(std_val) else 0.0
|
| 542 |
+
}
|
| 543 |
+
except:
|
| 544 |
+
key_metrics[col] = {'mean': 0.0, 'std': 0.0}
|
| 545 |
+
|
| 546 |
enhanced['statistical_summary'] = {
|
| 547 |
+
'numeric_columns': int(len(numeric_cols)),
|
| 548 |
+
'total_records': int(len(df)),
|
| 549 |
+
'missing_data_percentage': float((df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100),
|
| 550 |
+
'key_metrics': key_metrics
|
|
|
|
| 551 |
}
|
| 552 |
|
| 553 |
# Add categorical context
|
| 554 |
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
|
| 555 |
if len(categorical_cols) > 0:
|
| 556 |
+
unique_values = {}
|
| 557 |
+
for col in categorical_cols[:3]:
|
| 558 |
+
try:
|
| 559 |
+
unique_values[col] = int(df[col].nunique())
|
| 560 |
+
except:
|
| 561 |
+
unique_values[col] = 0
|
| 562 |
+
|
| 563 |
enhanced['categorical_summary'] = {
|
| 564 |
+
'categorical_columns': int(len(categorical_cols)),
|
| 565 |
+
'unique_values': unique_values
|
| 566 |
}
|
| 567 |
|
| 568 |
# Merge with intelligence analysis
|
|
|
|
| 675 |
key_metrics = variances.head(3).index.tolist()
|
| 676 |
|
| 677 |
return key_metrics[:5] # Return top 5 key metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 678 |
# Removed - no longer needed since we're letting AI decide everything organically
|
| 679 |
|
| 680 |
|