| """ | |
| Titanic Example - Demonstrating the complete Data Science Copilot workflow | |
| """ | |
| import sys | |
| import os | |
| from pathlib import Path | |
| # Add src to path | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) | |
| from orchestrator import DataScienceCopilot | |
| from rich.console import Console | |
| from rich.panel import Panel | |
| console = Console() | |


def main():
    """
    Complete example using the Titanic dataset.

    This demonstrates the full workflow:
    1. Dataset profiling
    2. Quality issue detection
    3. Data cleaning
    4. Feature engineering
    5. Model training
    6. Report generation
    """
    console.print(Panel.fit(
        "🚢 Titanic Survival Prediction - Complete Workflow Example",
        style="bold blue"
    ))

    # Setup
    titanic_path = "./data/titanic.csv"

    # Check if the dataset exists
    if not Path(titanic_path).exists():
        console.print("\n[yellow]⚠ Titanic dataset not found at ./data/titanic.csv[/yellow]")
        console.print("[yellow]Please download it from: https://www.kaggle.com/c/titanic/data[/yellow]")
        console.print("[yellow]Or place your own CSV file in the data directory[/yellow]\n")

        # Use a sample path instead
        console.print("[blue]Using sample dataset path for demonstration...[/blue]\n")
        titanic_path = "your_dataset.csv"  # User should replace this
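
        # For reference, the standard Kaggle Titanic training CSV has these
        # columns (well-known schema, not verified by this script):
        # PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch,
        # Ticket, Fare, Cabin, Embarked. "Survived" is the target used below.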

    # Initialize copilot
    console.print("\n[bold]Step 1: Initialize Data Science Copilot[/bold]")
    try:
        copilot = DataScienceCopilot(reasoning_effort="medium")
        console.print("[green]✓ Copilot initialized successfully[/green]")
    except Exception as e:
        console.print(f"[red]✗ Error: {e}[/red]")
        console.print("[yellow]Make sure to set GROQ_API_KEY in .env file[/yellow]")
        return

    # Define the task
    task_description = """
    Analyze the Titanic dataset and build a model to predict passenger survival.

    Key objectives:
    1. Understand the data structure and identify quality issues
    2. Handle missing values appropriately
    3. Engineer relevant features from available data (e.g., family size, titles from names)
    4. Train and compare multiple baseline models
    5. Identify the most important features for prediction
    6. Provide recommendations for improvement

    Target: Achieve competitive performance (aim for the 50th-70th percentile on the Kaggle leaderboard)
    """

    target_column = "Survived"

    console.print("\n[bold]Step 2: Run Complete Analysis Workflow[/bold]")
    console.print(f"Dataset: {titanic_path}")
    console.print(f"Target: {target_column}")
    console.print("Task: Predict passenger survival\n")

    # Run analysis
    try:
        result = copilot.analyze(
            file_path=titanic_path,
            task_description=task_description,
            target_col=target_column,
            use_cache=True,
            max_iterations=15  # Allow more iterations for complex workflow
        )

        # Display results
        if result["status"] == "success":
            console.print("\n[green]✓ Analysis Complete![/green]\n")

            # Display summary
            console.print(Panel(
                result["summary"],
                title="📊 Final Analysis Summary",
                border_style="green"
            ))

            # Display workflow steps
            console.print("\n[bold]🔧 Workflow Steps Executed:[/bold]")
            for i, step in enumerate(result["workflow_history"], 1):
                tool = step["tool"]
                success = step["result"].get("success", False)
                icon = "✓" if success else "✗"
                color = "green" if success else "red"
                console.print(f"{i}. [{color}]{icon}[/{color}] {tool}")

            # Display statistics
            console.print("\n[bold]📈 Execution Statistics:[/bold]")
            console.print(f"  Total Iterations: {result['iterations']}")
            console.print(f"  API Calls Made: {result['api_calls']}")
            console.print(f"  Execution Time: {result['execution_time']}s")

            # Check for trained models
            console.print("\n[bold]🤖 Model Training Results:[/bold]")
            for step in result["workflow_history"]:
                if step["tool"] == "train_baseline_models":
                    if step["result"].get("success"):
                        models_result = step["result"]["result"]
                        best_model = models_result.get("best_model", {})
                        if best_model:  # Guard: formatting a missing score would raise TypeError
                            console.print(f"  Best Model: {best_model.get('name')}")
                            console.print(f"  Score: {best_model.get('score'):.4f}")
                            console.print(f"  Model Path: {best_model.get('model_path')}")

            # Save results
            output_file = "./outputs/reports/titanic_analysis.json"
            Path(output_file).parent.mkdir(parents=True, exist_ok=True)

            import json
            with open(output_file, "w") as f:
                json.dump(result, f, indent=2)

            console.print(f"\n[cyan]💾 Full results saved to: {output_file}[/cyan]")

            # Next steps
            console.print("\n[bold]🎯 Next Steps:[/bold]")
            console.print("  1. Review the generated models in ./outputs/models/")
            console.print("  2. Check data quality reports in ./outputs/reports/")
            console.print("  3. Examine cleaned datasets in ./outputs/data/")
            console.print("  4. Use the best model for predictions on new data (see the sketch below)")

        elif result["status"] == "error":
            console.print(f"\n[red]✗ Analysis failed: {result['error']}[/red]")
            console.print(f"Error type: {result['error_type']}")
        else:
            console.print(f"\n[yellow]⚠ Analysis incomplete: {result.get('message')}[/yellow]")

    except Exception as e:
        console.print(f"\n[red]✗ Unexpected error: {e}[/red]")
        import traceback
        console.print(traceback.format_exc())

    # Cache statistics
    console.print("\n[bold]📦 Cache Statistics:[/bold]")
    cache_stats = copilot.get_cache_stats()
    console.print(f"  Valid Entries: {cache_stats['valid_entries']}")
    console.print(f"  Cache Size: {cache_stats['size_mb']} MB")


if __name__ == "__main__":
    main()
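
# Before running, set your Groq API key in a .env file at the project root
# (the variable name comes from the error hint in main()):
#
#     GROQ_API_KEY=your_key_here
#
# Then run the script directly, e.g. `python titanic_example.py` (the filename
# is assumed; use whatever this file is saved as).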