Update sozo_gen.py
Browse files- sozo_gen.py +297 -333
sozo_gen.py
CHANGED
|
@@ -268,388 +268,352 @@ def sanitize_for_firebase_key(text: str) -> str:
|
|
| 268 |
return text
|
| 269 |
|
| 270 |
# REPLACE THE OLD generate_report_draft WITH THIS CORRECTED VERSION
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
| 272 |
"""
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
Maintains backward compatibility with existing function signature and outputs.
|
| 277 |
"""
|
| 278 |
-
logging.info(f"Generating enhanced autonomous report draft for project {project_id}")
|
| 279 |
|
| 280 |
-
#
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
-
#
|
| 284 |
-
|
|
|
|
| 285 |
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
autonomous_context = perform_autonomous_data_analysis(df, ctx, name)
|
| 290 |
-
|
| 291 |
-
# Stage 2: Generate Enhanced Report with Intelligent Narrative
|
| 292 |
-
enhanced_report = generate_intelligent_report(llm, autonomous_context)
|
| 293 |
-
|
| 294 |
-
# Stage 3: Smart Chart Generation
|
| 295 |
-
chart_urls = generate_autonomous_charts(llm, df, enhanced_report, uid, project_id, bucket)
|
| 296 |
-
|
| 297 |
-
# Preserve original output structure
|
| 298 |
-
return {"raw_md": enhanced_report, "chartUrls": chart_urls}
|
| 299 |
-
|
| 300 |
-
except Exception as e:
|
| 301 |
-
logging.error(f"Enhanced analysis failed, falling back to original: {str(e)}")
|
| 302 |
-
# Fallback to original logic if enhancement fails
|
| 303 |
-
return generate_original_report(df, llm, ctx, uid, project_id, bucket)
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
def perform_autonomous_data_analysis(df: pd.DataFrame, user_ctx: str, filename: str) -> Dict[str, Any]:
|
| 307 |
-
"""
|
| 308 |
-
Performs comprehensive autonomous analysis of the dataset to understand its nature,
|
| 309 |
-
domain, and analytical potential.
|
| 310 |
-
"""
|
| 311 |
-
logging.info("Performing autonomous data analysis...")
|
| 312 |
-
|
| 313 |
-
# Basic data profiling with JSON-safe types
|
| 314 |
-
basic_info = {
|
| 315 |
-
"shape": df.shape,
|
| 316 |
-
"columns": list(df.columns),
|
| 317 |
-
"dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
|
| 318 |
-
"filename": filename,
|
| 319 |
-
"user_context": user_ctx
|
| 320 |
-
}
|
| 321 |
|
| 322 |
-
#
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
-
#
|
| 326 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
-
|
| 329 |
-
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
| 333 |
|
| 334 |
-
|
| 335 |
-
|
| 336 |
|
| 337 |
-
|
| 338 |
-
|
| 339 |
|
| 340 |
return {
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
}
|
| 350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
-
def
|
| 353 |
"""
|
| 354 |
-
|
| 355 |
-
|
| 356 |
"""
|
| 357 |
-
domain_indicators = {
|
| 358 |
-
"financial": ["amount", "price", "cost", "revenue", "profit", "transaction", "payment", "invoice"],
|
| 359 |
-
"survey": ["rating", "satisfaction", "response", "score", "survey", "feedback", "opinion"],
|
| 360 |
-
"scientific": ["measurement", "experiment", "test", "sample", "observation", "hypothesis", "variable"],
|
| 361 |
-
"marketing": ["campaign", "click", "conversion", "customer", "lead", "acquisition", "retention"],
|
| 362 |
-
"operational": ["process", "time", "duration", "status", "workflow", "performance", "efficiency"],
|
| 363 |
-
"sales": ["order", "product", "quantity", "sales", "customer", "deal", "pipeline"],
|
| 364 |
-
"hr": ["employee", "salary", "department", "performance", "training", "recruitment"],
|
| 365 |
-
"healthcare": ["patient", "diagnosis", "treatment", "medical", "health", "symptom", "medication"]
|
| 366 |
-
}
|
| 367 |
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
for domain, keywords in domain_indicators.items():
|
| 373 |
-
score = sum(1 for col in columns_lower for keyword in keywords if keyword in col)
|
| 374 |
-
domain_scores[domain] = score
|
| 375 |
|
| 376 |
-
#
|
| 377 |
-
|
| 378 |
-
for domain, keywords in domain_indicators.items():
|
| 379 |
-
if any(keyword in filename_lower for keyword in keywords):
|
| 380 |
-
domain_scores[domain] = domain_scores.get(domain, 0) + 2
|
| 381 |
|
| 382 |
-
|
| 383 |
-
numeric_ratio = len(df.select_dtypes(include=[np.number]).columns) / len(df.columns)
|
| 384 |
-
categorical_ratio = len(df.select_dtypes(include=['object']).columns) / len(df.columns)
|
| 385 |
-
|
| 386 |
-
# Determine primary domain
|
| 387 |
-
primary_domain = max(domain_scores, key=domain_scores.get) if domain_scores else "general"
|
| 388 |
-
|
| 389 |
-
return {
|
| 390 |
-
"primary_domain": primary_domain,
|
| 391 |
-
"domain_confidence": int(domain_scores.get(primary_domain, 0)),
|
| 392 |
-
"domain_scores": {k: int(v) for k, v in domain_scores.items()},
|
| 393 |
-
"data_characteristics": {
|
| 394 |
-
"numeric_ratio": float(numeric_ratio),
|
| 395 |
-
"categorical_ratio": float(categorical_ratio),
|
| 396 |
-
"is_time_series": detect_time_series(df),
|
| 397 |
-
"is_transactional": detect_transactional_data(df),
|
| 398 |
-
"is_experimental": detect_experimental_data(df)
|
| 399 |
-
}
|
| 400 |
-
}
|
| 401 |
|
|
|
|
|
|
|
| 402 |
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
}
|
| 414 |
|
| 415 |
-
|
| 416 |
-
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
| 417 |
-
if len(numeric_cols) > 0:
|
| 418 |
-
desc_stats = df[numeric_cols].describe()
|
| 419 |
-
# Convert to JSON-safe format
|
| 420 |
-
profile["summary_stats"] = {
|
| 421 |
-
col: {
|
| 422 |
-
stat: float(val) if pd.notna(val) else None
|
| 423 |
-
for stat, val in desc_stats[col].items()
|
| 424 |
-
}
|
| 425 |
-
for col in desc_stats.columns
|
| 426 |
-
}
|
| 427 |
-
|
| 428 |
-
# Correlation analysis with JSON-safe conversion
|
| 429 |
-
if len(numeric_cols) > 1:
|
| 430 |
-
corr_matrix = df[numeric_cols].corr()
|
| 431 |
-
# Find strong correlations
|
| 432 |
-
strong_corrs = []
|
| 433 |
-
for i in range(len(corr_matrix.columns)):
|
| 434 |
-
for j in range(i+1, len(corr_matrix.columns)):
|
| 435 |
-
corr_val = corr_matrix.iloc[i, j]
|
| 436 |
-
if abs(corr_val) > 0.7 and pd.notna(corr_val): # Strong correlation threshold
|
| 437 |
-
strong_corrs.append({
|
| 438 |
-
"var1": corr_matrix.columns[i],
|
| 439 |
-
"var2": corr_matrix.columns[j],
|
| 440 |
-
"correlation": float(corr_val)
|
| 441 |
-
})
|
| 442 |
-
profile["correlations"] = {"strong_correlations": strong_corrs}
|
| 443 |
-
|
| 444 |
-
# Categorical analysis
|
| 445 |
-
categorical_cols = df.select_dtypes(include=['object']).columns
|
| 446 |
-
if len(categorical_cols) > 0:
|
| 447 |
-
profile["categorical_analysis"] = {}
|
| 448 |
-
for col in categorical_cols:
|
| 449 |
-
value_counts = df[col].value_counts().head(5)
|
| 450 |
-
profile["categorical_analysis"][col] = {
|
| 451 |
-
"unique_count": int(df[col].nunique()),
|
| 452 |
-
"top_values": {str(k): int(v) for k, v in value_counts.items()}
|
| 453 |
-
}
|
| 454 |
|
| 455 |
-
#
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
for col, missing_count in missing_data.items():
|
| 459 |
-
if missing_count > 0:
|
| 460 |
-
missing_dict[col] = int(missing_count)
|
| 461 |
|
| 462 |
-
|
| 463 |
-
"
|
| 464 |
-
"total_missing_percentage": float((df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100)
|
| 465 |
-
}
|
| 466 |
|
| 467 |
-
|
| 468 |
-
|
|
|
|
|
|
|
| 469 |
|
| 470 |
-
def
|
| 471 |
-
"""
|
| 472 |
-
Discovers meaningful relationships and patterns in the data.
|
| 473 |
-
"""
|
| 474 |
-
relationships = {
|
| 475 |
-
"key_relationships": [],
|
| 476 |
-
"patterns": [],
|
| 477 |
-
"anomalies": []
|
| 478 |
-
}
|
| 479 |
|
| 480 |
-
#
|
| 481 |
-
|
| 482 |
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
for col1 in numeric_cols:
|
| 486 |
-
for col2 in numeric_cols:
|
| 487 |
-
if col1 != col2:
|
| 488 |
-
correlation = df[col1].corr(df[col2])
|
| 489 |
-
if abs(correlation) > 0.5 and pd.notna(correlation): # Moderate to strong correlation
|
| 490 |
-
relationships["key_relationships"].append({
|
| 491 |
-
"variable1": col1,
|
| 492 |
-
"variable2": col2,
|
| 493 |
-
"relationship_strength": float(correlation),
|
| 494 |
-
"relationship_type": "positive" if correlation > 0 else "negative"
|
| 495 |
-
})
|
| 496 |
-
|
| 497 |
-
# Identify patterns in categorical data
|
| 498 |
-
categorical_cols = df.select_dtypes(include=['object']).columns
|
| 499 |
-
for col in categorical_cols:
|
| 500 |
-
if df[col].nunique() < 20: # Reasonable number of categories
|
| 501 |
-
value_counts = df[col].value_counts()
|
| 502 |
-
if len(value_counts) > 0:
|
| 503 |
-
relationships["patterns"].append({
|
| 504 |
-
"column": col,
|
| 505 |
-
"pattern_type": "categorical_distribution",
|
| 506 |
-
"dominant_category": str(value_counts.index[0]),
|
| 507 |
-
"dominance_percentage": float((value_counts.iloc[0] / len(df)) * 100)
|
| 508 |
-
})
|
| 509 |
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
|
|
|
|
|
|
|
|
|
| 518 |
|
| 519 |
-
#
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
date_columns.append(col)
|
| 526 |
-
except:
|
| 527 |
-
continue
|
| 528 |
-
|
| 529 |
-
if date_columns:
|
| 530 |
-
temporal_insights["has_temporal_data"] = True
|
| 531 |
-
temporal_insights["date_columns"] = date_columns
|
| 532 |
-
|
| 533 |
-
# Analyze temporal patterns for the first date column
|
| 534 |
-
primary_date_col = date_columns[0]
|
| 535 |
-
df_temp = df.copy()
|
| 536 |
-
df_temp[primary_date_col] = pd.to_datetime(df_temp[primary_date_col])
|
| 537 |
-
|
| 538 |
-
temporal_insights["temporal_analysis"] = {
|
| 539 |
-
"date_range": {
|
| 540 |
-
"start": df_temp[primary_date_col].min().strftime('%Y-%m-%d'),
|
| 541 |
-
"end": df_temp[primary_date_col].max().strftime('%Y-%m-%d')
|
| 542 |
-
},
|
| 543 |
-
"time_span_days": int((df_temp[primary_date_col].max() - df_temp[primary_date_col].min()).days),
|
| 544 |
-
"frequency": detect_temporal_frequency(df_temp[primary_date_col])
|
| 545 |
}
|
| 546 |
|
| 547 |
-
|
| 548 |
-
|
|
|
|
|
|
|
| 549 |
|
| 550 |
-
def
|
| 551 |
"""
|
| 552 |
-
|
| 553 |
"""
|
| 554 |
-
|
| 555 |
-
"overall_quality_score": 0,
|
| 556 |
-
"quality_issues": [],
|
| 557 |
-
"data_completeness": 0,
|
| 558 |
-
"data_consistency": {}
|
| 559 |
-
}
|
| 560 |
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
quality_metrics["data_completeness"] = completeness
|
| 564 |
|
| 565 |
-
#
|
| 566 |
-
|
| 567 |
-
|
| 568 |
|
| 569 |
-
#
|
| 570 |
-
|
| 571 |
-
if duplicate_rows > 0:
|
| 572 |
-
quality_metrics["quality_issues"].append(f"{duplicate_rows} duplicate rows found")
|
| 573 |
|
| 574 |
-
#
|
| 575 |
-
|
| 576 |
-
if df[col].dtype == 'object':
|
| 577 |
-
if df[col].str.isnumeric().any() and not df[col].str.isnumeric().all():
|
| 578 |
-
quality_metrics["quality_issues"].append(f"Inconsistent data types in {col}")
|
| 579 |
|
| 580 |
-
#
|
| 581 |
-
|
| 582 |
-
base_score -= (100 - completeness) * 0.5 # Penalize missing data
|
| 583 |
-
base_score -= len(quality_metrics["quality_issues"]) * 5 # Penalize each quality issue
|
| 584 |
-
quality_metrics["overall_quality_score"] = float(max(0, base_score))
|
| 585 |
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
"""
|
| 591 |
-
Infers business context and potential use cases based on the data characteristics.
|
| 592 |
-
"""
|
| 593 |
-
domain = domain_analysis["primary_domain"]
|
| 594 |
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
},
|
| 601 |
-
"
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
|
|
|
| 617 |
|
| 618 |
-
return
|
| 619 |
-
"key_metrics": ["Performance indicators", "Trends", "Patterns"],
|
| 620 |
-
"typical_analyses": ["Descriptive analysis", "Trend identification", "Pattern recognition"],
|
| 621 |
-
"stakeholders": ["Business stakeholders", "Decision makers"]
|
| 622 |
-
})
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
def generate_intelligent_report(llm, autonomous_context: Dict[str, Any]) -> str:
|
| 626 |
-
"""
|
| 627 |
-
Generates an intelligent, domain-appropriate report with organic storytelling.
|
| 628 |
-
"""
|
| 629 |
-
# Create truly autonomous prompt that lets AI decide everything
|
| 630 |
-
enhanced_prompt = f"""
|
| 631 |
-
You are a world-class data analyst who has just been handed this dataset to analyze. Look at the data characteristics and tell me the most compelling story you can find.
|
| 632 |
-
|
| 633 |
-
**DATASET CONTEXT:**
|
| 634 |
-
{json.dumps(autonomous_context, indent=2)}
|
| 635 |
-
|
| 636 |
-
**YOUR MISSION:**
|
| 637 |
-
Analyze this data like you would if a CEO walked into your office and said "I need to understand what this data is telling us." Write a report that would make them say "This is exactly what I needed to know."
|
| 638 |
|
| 639 |
-
|
| 640 |
-
- Don't follow a rigid structure - let the data guide your narrative
|
| 641 |
-
- Choose your own headings and sections based on what the data reveals
|
| 642 |
-
- Write like you're presenting findings to someone who needs to make important decisions
|
| 643 |
-
- Include specific numbers and insights that matter
|
| 644 |
-
- Insert chart recommendations like: `<generate_chart: "chart_type | description">`
|
| 645 |
-
- Valid chart types: bar, pie, line, scatter, hist, box, heatmap
|
| 646 |
-
- Only recommend charts that truly support your narrative
|
| 647 |
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
-
|
| 652 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
|
| 654 |
|
| 655 |
# Removed - no longer needed since we're letting AI decide everything organically
|
|
|
|
| 268 |
return text
|
| 269 |
|
| 270 |
# REPLACE THE OLD generate_report_draft WITH THIS CORRECTED VERSION
|
| 271 |
+
from scipy import stats
|
| 272 |
+
import re
|
| 273 |
+
|
| 274 |
+
def analyze_data_intelligence(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
    """
    Autonomous data intelligence system that classifies domain,
    detects patterns, and determines optimal analytical approach.

    Args:
        df: Dataset to profile.
        ctx_dict: Caller-supplied context; currently unused here but kept for
            interface stability with enhance_data_context / generate_report_draft.

    Returns:
        A JSON-serializable dict with the primary domain, domain scores, data
        structure info, statistical profile, detected patterns, insight
        opportunities and narrative suggestions.
    """
    # Domain Classification Engine: keyword signals per business domain.
    domain_signals = {
        'financial': ['amount', 'price', 'cost', 'revenue', 'profit', 'balance', 'transaction', 'payment'],
        'survey': ['rating', 'satisfaction', 'score', 'response', 'feedback', 'opinion', 'agree', 'likert'],
        'scientific': ['measurement', 'experiment', 'trial', 'test', 'control', 'variable', 'hypothesis'],
        'marketing': ['campaign', 'conversion', 'click', 'impression', 'engagement', 'customer', 'segment'],
        'operational': ['performance', 'efficiency', 'throughput', 'capacity', 'utilization', 'process'],
        'temporal': ['date', 'time', 'timestamp', 'period', 'month', 'year', 'day', 'hour']
    }

    # Score each domain by how many column names contain one of its keywords.
    columns_lower = [col.lower() for col in df.columns]
    domain_scores = {
        domain: sum(1 for col in columns_lower if any(keyword in col for keyword in keywords))
        for domain, keywords in domain_signals.items()
    }

    # Determine primary domain; fall back to 'general' when nothing matched.
    primary_domain = max(domain_scores, key=domain_scores.get) if max(domain_scores.values()) > 0 else 'general'

    # Data Structure Analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

    # Detect time series: typed datetime columns or date/time-ish column names.
    is_timeseries = len(datetime_cols) > 0 or any('date' in col or 'time' in col for col in columns_lower)

    # Statistical Profile
    statistical_summary: Dict[str, Any] = {}
    if numeric_cols:
        # BUGFIX: mask the diagonal before taking the per-column max, otherwise
        # every column's "max correlation" is trivially its self-correlation (1.0).
        abs_corr = df[numeric_cols].corr().abs()
        np.fill_diagonal(abs_corr.values, np.nan)
        correlations = {
            col: (float(abs_corr[col].max()) if pd.notna(abs_corr[col].max()) else None)
            for col in abs_corr.columns
        }

        # BUGFIX: compute the outlier count on the NaN-free values directly.
        # The previous mask (built from dropna() values, applied to the full
        # column) is misaligned whenever the column contains NaNs.
        outliers = {}
        for col in numeric_cols:
            values = df[col].dropna()
            if len(values) > 0:
                z = stats.zscore(values)
                outliers[col] = int(np.count_nonzero(np.abs(z) > 3))

        statistical_summary = {
            'correlations': correlations,
            # normaltest requires >=8 samples; len > 8 keeps us safely above that.
            'distributions': {
                col: 'normal' if stats.normaltest(df[col].dropna())[1] > 0.05 else 'non_normal'
                for col in numeric_cols if len(df[col].dropna()) > 8
            },
            'outliers': outliers,
        }

    # Pattern Detection — wrapped in bool() so numpy scalars never leak into
    # the result (the dict is later serialized with json.dumps downstream).
    patterns = {
        'has_missing_data': bool(df.isnull().sum().sum() > 0),
        'has_duplicates': bool(df.duplicated().sum() > 0),
        'has_negative_values': bool(any(df[col].min() < 0 for col in numeric_cols if len(df[col].dropna()) > 0)),
        'has_categorical_hierarchy': bool(any(len(df[col].unique()) > 10 for col in categorical_cols)),
        'potential_segments': len(categorical_cols) > 0
    }

    # Insight Opportunities
    insight_opportunities = []

    if is_timeseries:
        insight_opportunities.append("temporal_trends")

    if len(numeric_cols) > 1:
        insight_opportunities.append("correlations")

    if len(categorical_cols) > 0 and len(numeric_cols) > 0:
        insight_opportunities.append("segmentation")

    if any(statistical_summary.get('outliers', {}).values()):
        insight_opportunities.append("anomalies")

    return {
        'primary_domain': primary_domain,
        'domain_confidence': domain_scores,
        'data_structure': {
            'is_timeseries': is_timeseries,
            'numeric_cols': numeric_cols,
            'categorical_cols': categorical_cols,
            'datetime_cols': datetime_cols
        },
        'statistical_profile': statistical_summary,
        'patterns': patterns,
        'insight_opportunities': insight_opportunities,
        'narrative_suggestions': get_narrative_suggestions(primary_domain, insight_opportunities, patterns)
    }
|
| 358 |
|
| 359 |
+
def get_narrative_suggestions(domain: str, opportunities: List[str], patterns: Dict) -> Dict[str, str]:
    """Pick the storytelling framework (hook/structure/focus) for a data domain.

    Unrecognized domains fall back to the 'general' framework. The
    opportunities/patterns arguments are part of the call contract but do not
    currently influence the selection.
    """
    framework_specs = [
        ('financial',
         "Follow the money trail that reveals your business's hidden opportunities",
         "performance → trends → risks → opportunities",
         "profitability, efficiency, growth patterns, risk indicators"),
        ('survey',
         "Your customers are speaking - here's what they're really saying",
         "sentiment → segments → drivers → actions",
         "satisfaction drivers, demographic patterns, improvement areas"),
        ('scientific',
         "The data reveals relationships that challenge conventional thinking",
         "hypothesis → evidence → significance → implications",
         "statistical significance, correlations, experimental validity"),
        ('marketing',
         "Discover the customer journey patterns driving your growth",
         "performance → segments → optimization → strategy",
         "conversion funnels, customer segments, campaign effectiveness"),
        ('operational',
         "Operational excellence lives in the details - here's where to look",
         "efficiency → bottlenecks → optimization → impact",
         "process efficiency, capacity utilization, improvement opportunities"),
        ('general',
         "Every dataset tells a story - here's what yours is saying",
         "overview → patterns → insights → implications",
         "key patterns, significant relationships, actionable insights"),
    ]

    frameworks = {
        name: {'hook': hook, 'structure': structure, 'focus': focus}
        for name, hook, structure, focus in framework_specs
    }

    if domain in frameworks:
        return frameworks[domain]
    return frameworks['general']
|
| 396 |
|
| 397 |
+
def create_autonomous_prompt(df: pd.DataFrame, enhanced_ctx: Dict, intelligence: Dict) -> str:
    """
    Generate a dynamic, intelligence-driven prompt that creates compelling narratives
    rather than following templates.

    Args:
        df: Source dataset. NOTE(review): not referenced in the body — presumably
            kept for interface parity with the other prompt/context helpers; confirm.
        enhanced_ctx: JSON-serializable context dict (embedded verbatim in the prompt).
        intelligence: Output of analyze_data_intelligence; must contain the keys
            'primary_domain', 'insight_opportunities', 'narrative_suggestions'
            and 'data_structure'.

    Returns:
        The complete LLM prompt string.
    """
    # Unpack the pieces of the intelligence analysis used in the prompt.
    domain = intelligence['primary_domain']
    opportunities = intelligence['insight_opportunities']
    narrative = intelligence['narrative_suggestions']

    # Dynamic chart strategy based on data characteristics
    chart_strategy = generate_chart_strategy(intelligence)

    # The prompt interpolates the domain, the serialized context, the narrative
    # framework, and the chart strategy into a fixed storytelling brief.
    prompt = f"""You are an elite data storyteller with deep expertise in {domain} analytics. Your mission is to uncover the compelling narrative hidden in this dataset and present it as a captivating story that drives action.

**THE DATA'S STORY CONTEXT:**
{json.dumps(enhanced_ctx, indent=2)}

**INTELLIGENCE ANALYSIS:**
- Primary Domain: {domain}
- Key Opportunities: {', '.join(opportunities)}
- Data Characteristics: {intelligence['data_structure']}
- Narrative Framework: {narrative['structure']}

**YOUR STORYTELLING MISSION:**
{narrative['hook']}

**NARRATIVE CONSTRUCTION GUIDELINES:**
1. **LEAD WITH INTRIGUE**: Start with the most compelling finding that hooks the reader
2. **BUILD TENSION**: Present contrasts, surprises, or unexpected patterns
3. **REVEAL INSIGHTS**: Use data to resolve the tension with clear explanations
4. **DRIVE ACTION**: End with specific, actionable recommendations

**VISUALIZATION STRATEGY:**
{chart_strategy}

**CRITICAL INSTRUCTIONS:**
- Write as if you're revealing a detective story, not filling a template
- Every insight must be supported by data evidence
- Use compelling headers that create curiosity (not "Executive Summary")
- Weave charts naturally into the narrative flow
- Focus on business impact and actionable outcomes
- Let the data's personality shine through your writing style

**CHART INTEGRATION:**
Insert charts using: `<generate_chart: "chart_type | compelling description that advances the story">`
Available types: bar, pie, line, scatter, hist

Transform this data into a story that decision-makers can't stop reading."""

    return prompt
|
| 448 |
+
|
| 449 |
+
def generate_chart_strategy(intelligence: Dict) -> str:
    """Build a visualization-strategy sentence tailored to the data intelligence.

    Starts from a per-domain base recommendation and appends extra guidance
    for time-series data, correlation opportunities, and segmentation
    opportunities found in the intelligence analysis.
    """
    domain = intelligence['primary_domain']
    opportunities = intelligence['insight_opportunities']
    structure = intelligence['data_structure']

    domain_strategies = {
        'financial': "Focus on trend lines showing performance over time, comparative bars for different categories, and scatter plots revealing correlations between financial metrics.",
        'survey': "Emphasize distribution histograms for satisfaction scores, segmented bar charts for demographic breakdowns, and correlation matrices for response patterns.",
        'scientific': "Prioritize scatter plots with regression lines, distribution comparisons, and statistical significance visualizations.",
        'marketing': "Highlight conversion funnels, customer segment comparisons, and campaign performance trends.",
        'operational': "Show efficiency trends, capacity utilization charts, and process performance comparisons."
    }

    # Collect the applicable pieces, then join them into one guidance string.
    pieces = [domain_strategies.get(domain, "Create visualizations that best tell your data's unique story.")]

    if structure['is_timeseries']:
        pieces.append("Leverage time-series visualizations to show trends and patterns over time.")

    if 'correlations' in opportunities:
        pieces.append("Include correlation visualizations to reveal hidden relationships.")

    if 'segmentation' in opportunities:
        pieces.append("Use segmented charts to highlight different groups or categories.")

    return " ".join(pieces)
|
| 477 |
|
| 478 |
+
def enhance_data_context(df: pd.DataFrame, ctx_dict: Dict) -> Dict[str, Any]:
    """Enhanced context generation with AI-driven analysis.

    Copies ctx_dict, adds a statistical and categorical summary of df, and
    attaches the full intelligence analysis under 'ai_intelligence'.

    All values are cast to native Python types: the result is serialized with
    json.dumps downstream (create_autonomous_prompt), and numpy scalars such
    as np.float64/np.int64 are not JSON-serializable.
    """
    # Get autonomous intelligence analysis
    intelligence = analyze_data_intelligence(df, ctx_dict)

    # Original context enhancement — never mutate the caller's dict.
    enhanced = ctx_dict.copy()

    if not df.empty:
        # Add statistical context
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            enhanced['statistical_summary'] = {
                'numeric_columns': len(numeric_cols),
                'total_records': len(df),
                # float() keeps this JSON-safe (the division yields np.float64).
                'missing_data_percentage': float((df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100),
                # Top 3 numeric columns; None when the stat is undefined
                # (all-NaN column, or std of a single row).
                'key_metrics': {
                    col: {
                        'mean': float(df[col].mean()) if pd.notna(df[col].mean()) else None,
                        'std': float(df[col].std()) if pd.notna(df[col].std()) else None,
                    }
                    for col in numeric_cols[:3]
                }
            }

        # Add categorical context
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            enhanced['categorical_summary'] = {
                'categorical_columns': len(categorical_cols),
                # int() converts np.int64 counts for JSON serialization.
                'unique_values': {col: int(df[col].nunique()) for col in categorical_cols[:3]}
            }

    # Merge with intelligence analysis
    enhanced['ai_intelligence'] = intelligence

    return enhanced
|
| 511 |
|
| 512 |
+
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
    """
    Enhanced autonomous report generation with intelligent narrative creation.

    Args:
        buf: Raw file buffer of the uploaded dataset.
        name: Original filename (used to pick the loader).
        ctx: Free-text user context for the analysis.
        uid: Owner user id (used in the storage path).
        project_id: Project id (used in logs and the storage path).
        bucket: Cloud storage bucket for chart uploads.

    Returns:
        {"raw_md": <markdown report>, "chartUrls": {safe_desc: public_url}}.
    """
    logging.info(f"Generating autonomous report draft for project {project_id}")

    df = load_dataframe_safely(buf, name)
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)

    # Build enhanced context with AI intelligence
    ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
    enhanced_ctx = enhance_data_context(df, ctx_dict)

    # BUGFIX: reuse the intelligence computed inside enhance_data_context
    # instead of running the full analysis a second time.
    intelligence = enhanced_ctx.get('ai_intelligence') or analyze_data_intelligence(df, ctx_dict)

    # Generate autonomous prompt and the report itself.
    report_prompt = create_autonomous_prompt(df, enhanced_ctx, intelligence)
    md = llm.invoke(report_prompt).content

    # Extract and process charts
    chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
    chart_urls = {}
    chart_generator = ChartGenerator(llm, df)

    for desc in chart_descs:
        # Create a safe key for Firebase
        safe_desc = sanitize_for_firebase_key(desc)

        # Replace the original description in the markdown with the safe one
        md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
        md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')  # Handle no quotes case

        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
            img_path = Path(temp_file.name)
        try:
            chart_spec = chart_generator.generate_chart_spec(desc)  # Still generate spec from original desc
            if execute_chart_spec(chart_spec, df, img_path):
                blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                blob = bucket.blob(blob_name)
                blob.upload_from_filename(str(img_path))

                # Use the safe key in the dictionary
                chart_urls[safe_desc] = blob.public_url
                logging.info(f"Uploaded chart '{desc}' to {blob.public_url} with safe key '{safe_desc}'")
        except Exception as chart_err:
            # ROBUSTNESS: one bad chart (spec generation or upload failure)
            # should not sink the whole report — log and continue.
            logging.error(f"Chart generation failed for '{desc}': {chart_err}")
        finally:
            if os.path.exists(img_path):
                os.unlink(img_path)

    return {"raw_md": md, "chartUrls": chart_urls}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
|
| 565 |
+
# Additional helper functions for the autonomous system
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
|
| 567 |
+
def detect_data_relationships(df: pd.DataFrame) -> Dict[str, Any]:
    """Detect relationships and patterns in the data.

    Returns a dict with 'strong_correlations' (pairs of numeric columns whose
    Pearson correlation exceeds 0.7 in magnitude) when the frame has at least
    two numeric columns; otherwise an empty dict.
    """
    relationships: Dict[str, Any] = {}
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) <= 1:
        return relationships

    corr = df[numeric_cols].corr()
    cols = list(corr.columns)

    # Walk the upper triangle only, so each pair is reported once.
    strong_pairs = []
    for idx, first in enumerate(cols):
        for second in cols[idx + 1:]:
            value = corr.loc[first, second]
            if abs(value) > 0.7:
                strong_pairs.append({
                    'var1': first,
                    'var2': second,
                    'correlation': value
                })
    relationships['strong_correlations'] = strong_pairs

    return relationships
|
| 588 |
|
| 589 |
+
def identify_key_metrics(df: pd.DataFrame, domain: str) -> List[str]:
    """Identify the most important metrics based on domain and data characteristics.

    Numeric columns whose names contain a domain-priority keyword are selected
    first (in column order). If none match, the three highest-variance numeric
    columns are used instead. At most five names are returned.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    domain_priorities = {
        'financial': ['revenue', 'profit', 'cost', 'amount', 'price', 'margin'],
        'survey': ['rating', 'score', 'satisfaction', 'response'],
        'marketing': ['conversion', 'click', 'impression', 'engagement'],
        'operational': ['efficiency', 'utilization', 'throughput', 'performance']
    }
    priorities = domain_priorities.get(domain, [])

    # Match column names against the domain's priority keywords.
    selected = [
        col for col in numeric_cols
        if any(keyword in col.lower() for keyword in priorities)
    ]

    # If no matches, use columns with highest variance (most interesting).
    if not selected and numeric_cols:
        ranked = df[numeric_cols].var().sort_values(ascending=False)
        selected = ranked.head(3).index.tolist()

    return selected[:5]  # Return top 5 key metrics
|
| 617 |
|
| 618 |
|
| 619 |
# Removed - no longer needed since we're letting AI decide everything organically
|