# Data-Science-Agent / examples / titanic_example.py
# Pulastya B
# feat: Initial commit - Data Science Agent with React frontend and FastAPI backend
# 226ac39
"""
Titanic Example - Demonstrating the complete Data Science Copilot workflow
"""
import sys
import os
from pathlib import Path

# Make the project's ``src`` directory importable so ``orchestrator`` resolves.
# This script lives in ``examples/``, so a ``src`` directory next to the
# ``examples`` folder is one level up from this file. The original lookup
# location (a ``src`` directory beside this script) is still added and, being
# inserted last at position 0, keeps precedence if it exists.
# NOTE(review): assumes ``src/`` sits at the repository root -- confirm layout.
_HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(os.path.dirname(_HERE), 'src'))
sys.path.insert(0, os.path.join(_HERE, 'src'))

from orchestrator import DataScienceCopilot
from rich.console import Console
from rich.panel import Panel

# Shared rich console used for all output in this script.
console = Console()
def _format_score(score):
    """Render a model score with four decimals, tolerating missing values.

    Returns ``"N/A"`` when ``score`` is ``None`` and ``str(score)`` when the
    value cannot be formatted as a float, instead of raising.
    """
    try:
        return f"{score:.4f}"
    except (TypeError, ValueError):
        # ``None`` or a non-numeric value does not support the ``.4f`` spec.
        return "N/A" if score is None else str(score)


def main():
    """Run the end-to-end Titanic example and print the outcome to the console.

    Steps performed:
      1. Pick the dataset path (falls back to a placeholder path when
         ``./data/titanic.csv`` does not exist).
      2. Create a ``DataScienceCopilot``; on failure print a hint and return.
      3. Call ``copilot.analyze`` with the task description and target column.
      4. On success, print the summary, executed workflow steps, execution
         statistics and model-training results, then dump the full result to
         ``./outputs/reports/titanic_analysis.json``.
      5. Print cache statistics obtained from ``copilot.get_cache_stats()``.

    Returns:
        None. Any exception raised while running or reporting the analysis is
        caught and printed together with its traceback.
    """
    import json
    import traceback

    console.print(Panel.fit(
        "🚒 Titanic Survival Prediction - Complete Workflow Example",
        style="bold blue"
    ))

    # Setup
    titanic_path = "./data/titanic.csv"

    # Check if dataset exists
    if not Path(titanic_path).exists():
        console.print("\n[yellow]⚠ Titanic dataset not found at ./data/titanic.csv[/yellow]")
        console.print("[yellow]Please download it from: https://www.kaggle.com/c/titanic/data[/yellow]")
        console.print("[yellow]Or place your own CSV file in the data directory[/yellow]\n")
        # Continue with a placeholder path so the rest of the walkthrough is
        # still shown; the user is expected to replace it with a real file.
        console.print("[blue]Using sample dataset path for demonstration...[/blue]\n")
        titanic_path = "your_dataset.csv"  # User should replace this

    # Initialize copilot
    console.print("\n[bold]Step 1: Initialize Data Science Copilot[/bold]")
    try:
        copilot = DataScienceCopilot(reasoning_effort="medium")
        console.print("[green]✓ Copilot initialized successfully[/green]")
    except Exception as e:
        # Top-level boundary of an example script: report and stop.
        console.print(f"[red]✗ Error: {e}[/red]")
        console.print("[yellow]Make sure to set GROQ_API_KEY in .env file[/yellow]")
        return

    # Define the task
    task_description = """
Analyze the Titanic dataset and build a model to predict passenger survival.
Key objectives:
1. Understand the data structure and identify quality issues
2. Handle missing values appropriately
3. Engineer relevant features from available data (e.g., family size, titles from names)
4. Train and compare multiple baseline models
5. Identify the most important features for prediction
6. Provide recommendations for improvement
Target: Achieve competitive performance (aim for 50-70th percentile on Kaggle leaderboard)
"""
    target_column = "Survived"

    console.print("\n[bold]Step 2: Run Complete Analysis Workflow[/bold]")
    console.print(f"Dataset: {titanic_path}")
    console.print(f"Target: {target_column}")
    console.print("Task: Predict passenger survival\n")

    # Run analysis
    try:
        result = copilot.analyze(
            file_path=titanic_path,
            task_description=task_description,
            target_col=target_column,
            use_cache=True,
            max_iterations=15  # Allow more iterations for complex workflow
        )

        # Display results
        if result["status"] == "success":
            console.print("\n[green]✓ Analysis Complete![/green]\n")

            # Display summary
            console.print(Panel(
                result["summary"],
                title="📋 Final Analysis Summary",
                border_style="green"
            ))

            # Display workflow steps
            console.print("\n[bold]🔧 Workflow Steps Executed:[/bold]")
            for i, step in enumerate(result["workflow_history"], 1):
                tool = step["tool"]
                success = step["result"].get("success", False)
                icon = "✓" if success else "✗"
                color = "green" if success else "red"
                console.print(f"{i}. [{color}]{icon}[/{color}] {tool}")

            # Display statistics
            console.print("\n[bold]📊 Execution Statistics:[/bold]")
            console.print(f" Total Iterations: {result['iterations']}")
            console.print(f" API Calls Made: {result['api_calls']}")
            console.print(f" Execution Time: {result['execution_time']}s")

            # Check for trained models
            console.print("\n[bold]🤖 Model Training Results:[/bold]")
            for step in result["workflow_history"]:
                if step["tool"] != "train_baseline_models":
                    continue
                if not step["result"].get("success"):
                    continue
                models_result = step["result"]["result"]
                # ``or {}`` also covers an explicit ``best_model: None`` entry.
                best_model = models_result.get("best_model") or {}
                console.print(f" Best Model: {best_model.get('name')}")
                # A missing score must not abort the report with a TypeError.
                console.print(f" Score: {_format_score(best_model.get('score'))}")
                console.print(f" Model Path: {best_model.get('model_path')}")

            # Save results
            output_file = "./outputs/reports/titanic_analysis.json"
            Path(output_file).parent.mkdir(parents=True, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                # ``default=str`` is a deliberate best-effort choice: any value
                # json cannot encode natively is stringified rather than
                # aborting the dump (and leaving a truncated file) after the
                # analysis itself has already succeeded.
                json.dump(result, f, indent=2, default=str)
            console.print(f"\n[cyan]💾 Full results saved to: {output_file}[/cyan]")

            # Next steps
            console.print("\n[bold]🎯 Next Steps:[/bold]")
            console.print(" 1. Review the generated models in ./outputs/models/")
            console.print(" 2. Check data quality reports in ./outputs/reports/")
            console.print(" 3. Examine cleaned datasets in ./outputs/data/")
            console.print(" 4. Use the best model for predictions on new data")
        elif result["status"] == "error":
            console.print(f"\n[red]✗ Analysis failed: {result['error']}[/red]")
            console.print(f"Error type: {result['error_type']}")
        else:
            console.print(f"\n[yellow]⚠ Analysis incomplete: {result.get('message')}[/yellow]")
    except Exception as e:
        # Top-level boundary of an example script: show the failure in full
        # instead of crashing, then still report cache statistics below.
        console.print(f"\n[red]✗ Unexpected error: {e}[/red]")
        console.print(traceback.format_exc())

    # Cache statistics
    console.print("\n[bold]📦 Cache Statistics:[/bold]")
    cache_stats = copilot.get_cache_stats()
    console.print(f" Valid Entries: {cache_stats['valid_entries']}")
    console.print(f" Cache Size: {cache_stats['size_mb']} MB")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()