diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..31da76a70f16340f00a7a15f371707bdd5660802 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,74 @@ +# Python cache and environment +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.venv/ +venv/ +ENV/ +env/ + +# Development files +.git/ +.gitignore +.env +.env.local +*.log + +# Output directories (not needed in container) +outputs/ +cache_db/ +temp/ +test_data/ +data/ + +# Frontend development files (will be built in Docker) +FRRONTEEEND/node_modules/ +FRRONTEEEND/.env +FRRONTEEEND/.env.local + +# Documentation and tests +*.md +!README.md +tests/ +test_*.py +check_*.py + +# Old Gradio UI (no longer used) +chat_ui.py + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + +# Jupyter notebooks +*.ipynb +.ipynb_checkpoints/ + +# Large model files (if any) +*.pkl +*.joblib +*.h5 +*.pt +*.pth + +# Documentation +docs/ +PHASE*.md +PROJECT*.md +TOKEN*.md +TOOL*.md +FEATURE*.md +IMPLEMENTATION*.md +MIGRATION*.md +EDA_REPORTS*.md +GITHUB*.md +BIGQUERY*.md diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..3ffe412f7069008a8c3f5cf6ab3284be470fddbd --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +# Google Gemini API Configuration +GOOGLE_API_KEY=your_google_api_key_here + +# Model Configuration +LLM_PROVIDER=gemini +REASONING_EFFORT=medium + +# Cache Configuration +CACHE_DB_PATH=./cache_db/cache.db +CACHE_TTL_SECONDS=86400 + +# Output Configuration +OUTPUT_DIR=./outputs +DATA_DIR=./data + +# Performance Configuration +MAX_PARALLEL_TOOLS=5 +MAX_RETRIES=3 +TIMEOUT_SECONDS=300 diff --git a/.gcloudignore b/.gcloudignore new file mode 100644 index 0000000000000000000000000000000000000000..0a4e0a796dbedc00057e8c2a564a796cdcfb6b24 --- /dev/null +++ b/.gcloudignore @@ -0,0 +1,59 @@ +# This file specifies files that are *not* uploaded to Google Cloud +# using gcloud. 
It follows the same syntax as .gitignore + +.gcloudignore +.git +.gitignore + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.venv/ +venv/ +ENV/ +env/ + +# Local development +.env +.env.local +*.log + +# Outputs and cache (regenerated in cloud) +outputs/ +cache_db/ +temp/ +test_data/ +data/ + +# Documentation +*.md +!README.md + +# Tests +tests/ +test_*.py +check_*.py + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Jupyter +*.ipynb +.ipynb_checkpoints/ + +# Build artifacts +*.pkl +*.joblib +*.h5 +*.pt +*.pth diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fc805344ff2d616e7d19e3fe37cabae534b0a9d2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,71 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +.venv/ +env/ +ENV/ + +# Environment Variables +.env + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Cache & Outputs +cache_db/*.db +cache_db/*.db-journal +cache_db/ +outputs/ +temp/ +*.pkl +*.joblib + +# Data files (except examples) +data/*.csv +data/*.parquet +!data/.gitkeep + +# Cloud Run URL +.cloud_run_url + +# Jupyter +.ipynb_checkpoints/ +*.ipynb + +# OS +.DS_Store +Thumbs.db + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# Logs +*.log diff --git a/BIGQUERY_SCHEMAS.md b/BIGQUERY_SCHEMAS.md new file mode 100644 index 0000000000000000000000000000000000000000..feb751923f08eb893cde28cd9b2f7ee4fc8112b5 --- /dev/null +++ b/BIGQUERY_SCHEMAS.md @@ -0,0 +1,691 @@ +# BigQuery Output Schemas for Looker Compatibility + +**Purpose**: Define stable BigQuery table schemas that BI tools (Looker, Data Studio) can query reliably. + +**Design Principles**: +- ✅ **Stable Schema**: No breaking changes without versioning +- ✅ **Consistent Naming**: snake_case columns, clear dimension/metric separation +- ✅ **BI-Friendly Types**: Standard SQL types, no complex nested structures +- ✅ **Documented Grain**: Clear primary keys and update patterns +- ✅ **Dashboard-Ready**: Metrics aligned with common visualizations + +--- + +## 📊 Table 1: `model_metrics` + +**Description**: Model performance metrics tracked over time for monitoring and comparison. 
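+For context on how rows land in this table, here is a minimal sketch using the standard `google-cloud-bigquery` client. The table path and column values are placeholders; in practice the writer helpers listed under "Related Tools" at the end of this document would be used instead.
+
+```python
+# Minimal sketch (assumptions: google-cloud-bigquery installed, default credentials,
+# placeholder table path). Column names follow the schema defined below.
+from datetime import datetime, timezone
+
+from google.cloud import bigquery
+
+client = bigquery.Client()
+
+row = {
+    "model_id": "xgboost_churn_20251223_153045",  # primary key (see schema below)
+    "model_type": "XGBoost",
+    "task_type": "classification",
+    "accuracy": 0.95,
+    "f1_score": 0.90,
+    "created_at": datetime.now(timezone.utc).isoformat(),
+    "created_date": datetime.now(timezone.utc).date().isoformat(),
+}
+
+# insert_rows_json streams the row; a non-empty return value lists per-row errors
+errors = client.insert_rows_json("project.dataset.model_metrics", [row])
+if errors:
+    raise RuntimeError(f"model_metrics insert failed: {errors}")
+```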
+ +**Use Cases**: +- Performance dashboards +- Model comparison reports +- Drift detection alerts +- A/B test analysis + +**Update Frequency**: On every model training run + +**Grain**: One row per model training execution + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `project_id` | STRING | Google Cloud project ID | Dimension | `my-ml-project` | +| `dataset_id` | STRING | BigQuery dataset name | Dimension | `ml_models` | +| `model_id` | STRING | Unique model identifier | Dimension (Primary Key) | `xgboost_churn_20251223_153045` | +| `model_name` | STRING | Human-readable model name | Dimension | `Customer Churn Predictor` | +| `model_type` | STRING | Algorithm used | Dimension | `XGBoost`, `RandomForest`, `LightGBM` | +| `task_type` | STRING | ML task category | Dimension | `classification`, `regression` | +| `training_dataset` | STRING | Source table/file reference | Dimension | `project.dataset.train_data` | +| `target_column` | STRING | Prediction target name | Dimension | `churn`, `price`, `survived` | +| `created_at` | TIMESTAMP | Model training timestamp | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `created_date` | DATE | Training date (for partitioning) | Dimension (Time) | `2025-12-23` | +| `feature_count` | INTEGER | Number of features used | Metric | `42` | +| `training_rows` | INTEGER | Training set size | Metric | `10000` | +| `test_rows` | INTEGER | Test set size | Metric | `2500` | +| `training_duration_seconds` | FLOAT | Time to train model | Metric | `123.45` | +| `accuracy` | FLOAT | Overall accuracy (0-1) | Metric | `0.95` | +| `precision` | FLOAT | Precision score (0-1) | Metric | `0.92` | +| `recall` | FLOAT | Recall score (0-1) | Metric | `0.88` | +| `f1_score` | FLOAT | F1 score (0-1) | Metric | `0.90` | +| `roc_auc` | FLOAT | ROC AUC score (0-1) | Metric | `0.94` | +| `pr_auc` | FLOAT | Precision-Recall AUC (0-1) | Metric | `0.91` | +| `mae` | FLOAT | Mean Absolute Error (regression) | Metric | `1234.56` | +| `mse` | FLOAT | Mean Squared Error (regression) | Metric | `567890.12` | +| `rmse` | FLOAT | Root Mean Squared Error (regression) | Metric | `753.59` | +| `r2_score` | FLOAT | R² coefficient (regression) | Metric | `0.85` | +| `cross_val_mean` | FLOAT | Mean CV score | Metric | `0.93` | +| `cross_val_std` | FLOAT | CV score std deviation | Metric | `0.02` | +| `hyperparameters` | STRING (JSON) | Model hyperparameters | Metadata | `{"max_depth": 6, "n_estimators": 100}` | +| `version` | STRING | Model version tag | Dimension | `v1.2.3` | +| `environment` | STRING | Training environment | Dimension | `production`, `staging`, `development` | +| `user_email` | STRING | User who trained model | Dimension | `data-scientist@company.com` | + +### Partitioning & Clustering + +```sql +-- Recommended table setup +CREATE TABLE `project.dataset.model_metrics` +( + -- columns as above +) +PARTITION BY created_date +CLUSTER BY model_type, task_type, environment +OPTIONS( + description="Model performance metrics for BI dashboards", + require_partition_filter=true +); +``` + +### Primary Dimensions for Looker + +- **Time**: `created_at`, `created_date` +- **Model**: `model_type`, `model_name`, `task_type` +- **Performance Tier**: CASE expression on `accuracy`/`f1_score` + - `Excellent` (>0.90) + - `Good` (0.80-0.90) + - `Fair` (0.70-0.80) + - `Poor` (<0.70) + +### Sample Looker View + +```lookml +view: model_metrics { + sql_table_name: `project.dataset.model_metrics` 
;; + + dimension: model_id { + primary_key: yes + type: string + sql: ${TABLE}.model_id ;; + } + + dimension_group: created { + type: time + timeframes: [date, week, month, quarter, year] + sql: ${TABLE}.created_at ;; + } + + dimension: model_type { + type: string + sql: ${TABLE}.model_type ;; + } + + dimension: performance_tier { + type: string + sql: CASE + WHEN ${TABLE}.accuracy >= 0.90 THEN 'Excellent' + WHEN ${TABLE}.accuracy >= 0.80 THEN 'Good' + WHEN ${TABLE}.accuracy >= 0.70 THEN 'Fair' + ELSE 'Poor' + END ;; + } + + measure: count { + type: count + } + + measure: avg_accuracy { + type: average + sql: ${TABLE}.accuracy ;; + value_format_name: percent_2 + } + + measure: avg_f1_score { + type: average + sql: ${TABLE}.f1_score ;; + value_format_name: percent_2 + } +} +``` + +--- + +## 🎯 Table 2: `feature_importance` + +**Description**: Feature importance scores for model interpretability. + +**Use Cases**: +- Feature impact analysis +- Feature selection dashboards +- Model explainability reports + +**Update Frequency**: On every model training run + +**Grain**: One row per feature per model + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `model_id` | STRING | Foreign key to model_metrics | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` | +| `feature_name` | STRING | Name of the feature | Dimension (Primary Key) | `age`, `total_purchases`, `days_since_last_login` | +| `importance_score` | FLOAT | Importance value (0-1) | Metric | `0.35` | +| `importance_rank` | INTEGER | Rank by importance (1=most important) | Metric | `1`, `2`, `3` | +| `importance_type` | STRING | Calculation method | Dimension | `gain`, `weight`, `cover`, `shap` | +| `feature_type` | STRING | Data type category | Dimension | `numeric`, `categorical`, `datetime`, `text` | +| `is_engineered` | BOOLEAN | Created by feature engineering? | Dimension | `true`, `false` | +| `created_at` | TIMESTAMP | When importance was calculated | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `created_date` | DATE | Calculation date | Dimension (Time) | `2025-12-23` | + +### Partitioning & Clustering + +```sql +CREATE TABLE `project.dataset.feature_importance` +( + -- columns as above +) +PARTITION BY created_date +CLUSTER BY model_id, importance_rank +OPTIONS( + description="Feature importance scores for model explainability", + require_partition_filter=false -- Allow cross-model queries +); +``` + +### Primary Dimensions for Looker + +- **Feature**: `feature_name`, `feature_type`, `is_engineered` +- **Model**: `model_id` (join to model_metrics) +- **Importance**: `importance_rank`, `importance_type` + +### Sample Looker View + +```lookml +view: feature_importance { + sql_table_name: `project.dataset.feature_importance` ;; + + dimension: compound_key { + primary_key: yes + hidden: yes + sql: CONCAT(${TABLE}.model_id, '|', ${TABLE}.feature_name) ;; + } + + dimension: feature_name { + type: string + sql: ${TABLE}.feature_name ;; + } + + dimension: is_top_10 { + type: yesno + sql: ${TABLE}.importance_rank <= 10 ;; + } + + measure: avg_importance { + type: average + sql: ${TABLE}.importance_score ;; + value_format_name: percent_2 + } + + measure: count_features { + type: count_distinct + sql: ${TABLE}.feature_name ;; + } +} +``` + +--- + +## 🔮 Table 3: `predictions` + +**Description**: Model predictions with actuals for monitoring and evaluation. 
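+As a quick illustration of how the evaluation columns relate to each other, here is a minimal sketch of filling them in once the actual outcome is recorded. Column names follow the schema below; the helper function itself is illustrative and not part of the repo.
+
+```python
+# Minimal sketch (assumption: a prediction row is a plain dict keyed by the column names below).
+def with_actuals(row: dict, actual_value: float, actual_class: str | None = None) -> dict:
+    """Derive actual_value/actual_class and the error columns for one prediction row."""
+    predicted = row["prediction_value"]
+    row["actual_value"] = actual_value
+    row["actual_class"] = actual_class
+    row["absolute_error"] = abs(predicted - actual_value)
+    row["squared_error"] = (predicted - actual_value) ** 2
+    if actual_class is not None:
+        row["is_correct"] = row.get("prediction_class") == actual_class
+    return row
+
+# Example: churn probability 0.85 scored as "churn"; the customer did churn (actual = 1.0)
+scored = {"prediction_id": "pred_abc123xyz", "prediction_value": 0.85, "prediction_class": "churn"}
+print(with_actuals(scored, actual_value=1.0, actual_class="churn"))
+```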
+ +**Use Cases**: +- Prediction monitoring +- Accuracy tracking over time +- Segment performance analysis +- Business impact measurement + +**Update Frequency**: Real-time or batch (daily/hourly) + +**Grain**: One row per prediction + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `prediction_id` | STRING | Unique prediction identifier | Dimension (Primary Key) | `pred_abc123xyz` | +| `model_id` | STRING | Model used for prediction | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` | +| `entity_id` | STRING | Entity being predicted (customer_id, product_id, etc.) | Dimension | `customer_12345` | +| `predicted_at` | TIMESTAMP | When prediction was made | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `predicted_date` | DATE | Prediction date (for partitioning) | Dimension (Time) | `2025-12-23` | +| `prediction_value` | FLOAT | Predicted value | Metric | `0.85` (probability), `49.99` (price) | +| `prediction_class` | STRING | Predicted class (classification) | Dimension | `churn`, `not_churn` | +| `prediction_confidence` | FLOAT | Model confidence (0-1) | Metric | `0.92` | +| `actual_value` | FLOAT | True value (when available) | Metric | `1.0` (churned), `52.50` (actual price) | +| `actual_class` | STRING | True class (when available) | Dimension | `churn`, `not_churn` | +| `actual_recorded_at` | TIMESTAMP | When actual became known | Dimension (Time) | `2025-12-30 10:00:00 UTC` | +| `is_correct` | BOOLEAN | Prediction was correct? | Dimension | `true`, `false` | +| `absolute_error` | FLOAT | \|predicted - actual\| | Metric | `2.51` | +| `squared_error` | FLOAT | (predicted - actual)² | Metric | `6.30` | +| `feature_values` | STRING (JSON) | Input features used | Metadata | `{"age": 35, "tenure": 24}` | +| `segment` | STRING | Business segment | Dimension | `enterprise`, `smb`, `consumer` | +| `region` | STRING | Geographic region | Dimension | `us-west`, `eu-central` | +| `model_version` | STRING | Model version | Dimension | `v1.2.3` | +| `prediction_latency_ms` | FLOAT | Inference time | Metric | `23.4` | + +### Partitioning & Clustering + +```sql +CREATE TABLE `project.dataset.predictions` +( + -- columns as above +) +PARTITION BY predicted_date +CLUSTER BY model_id, segment, is_correct +OPTIONS( + description="Model predictions with actuals for monitoring", + require_partition_filter=true, + partition_expiration_days=730 -- 2 years retention +); +``` + +### Primary Dimensions for Looker + +- **Time**: `predicted_date`, days since prediction +- **Model**: `model_id`, `model_version` +- **Segment**: `segment`, `region` +- **Accuracy**: `is_correct`, error buckets + +### Sample Looker View + +```lookml +view: predictions { + sql_table_name: `project.dataset.predictions` ;; + + dimension: prediction_id { + primary_key: yes + type: string + sql: ${TABLE}.prediction_id ;; + } + + dimension_group: predicted { + type: time + timeframes: [date, week, month] + sql: ${TABLE}.predicted_at ;; + } + + dimension: segment { + type: string + sql: ${TABLE}.segment ;; + } + + dimension: error_bucket { + type: string + sql: CASE + WHEN ${TABLE}.absolute_error IS NULL THEN 'No Actual Yet' + WHEN ${TABLE}.absolute_error <= 0.1 THEN '0-10%' + WHEN ${TABLE}.absolute_error <= 0.2 THEN '10-20%' + ELSE '>20%' + END ;; + } + + measure: count { + type: count + } + + measure: accuracy_rate { + type: average + sql: CAST(${TABLE}.is_correct AS FLOAT64) ;; + value_format_name: percent_1 + } + + measure: 
avg_confidence { + type: average + sql: ${TABLE}.prediction_confidence ;; + value_format_name: percent_2 + } + + measure: mae { + type: average + sql: ${TABLE}.absolute_error ;; + value_format_name: decimal_2 + } +} +``` + +--- + +## 📋 Table 4: `data_profile_summary` + +**Description**: Dataset profiling statistics for data quality monitoring. + +**Use Cases**: +- Data quality dashboards +- Schema drift detection +- Data validation reports +- Column-level monitoring + +**Update Frequency**: Daily or on-demand + +**Grain**: One row per column per dataset per run + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `profile_id` | STRING | Unique profile run identifier | Dimension (Primary Key) | `profile_abc123xyz` | +| `dataset_name` | STRING | Source table/file name | Dimension | `project.dataset.customers` | +| `column_name` | STRING | Column being profiled | Dimension | `age`, `email`, `signup_date` | +| `profiled_at` | TIMESTAMP | When profiling ran | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `profiled_date` | DATE | Profiling date | Dimension (Time) | `2025-12-23` | +| `data_type` | STRING | Column data type | Dimension | `INTEGER`, `STRING`, `FLOAT`, `TIMESTAMP` | +| `inferred_type` | STRING | Smart type inference | Dimension | `numeric`, `categorical`, `datetime`, `text`, `email` | +| `row_count` | INTEGER | Total rows in dataset | Metric | `10000` | +| `non_null_count` | INTEGER | Non-null values | Metric | `9850` | +| `null_count` | INTEGER | Null values | Metric | `150` | +| `null_percentage` | FLOAT | % null (0-100) | Metric | `1.5` | +| `unique_count` | INTEGER | Distinct values | Metric | `450` | +| `uniqueness_percentage` | FLOAT | % unique (0-100) | Metric | `4.5` | +| `min_value` | STRING | Minimum value (as string) | Metadata | `18`, `2020-01-01` | +| `max_value` | STRING | Maximum value (as string) | Metadata | `95`, `2025-12-23` | +| `mean_value` | FLOAT | Mean (numeric only) | Metric | `42.5` | +| `median_value` | FLOAT | Median (numeric only) | Metric | `38.0` | +| `std_dev` | FLOAT | Standard deviation (numeric only) | Metric | `15.2` | +| `skewness` | FLOAT | Distribution skewness | Metric | `0.85` | +| `kurtosis` | FLOAT | Distribution kurtosis | Metric | `2.1` | +| `top_value` | STRING | Most common value | Metadata | `male`, `active` | +| `top_value_frequency` | INTEGER | Count of most common value | Metric | `6500` | +| `top_value_percentage` | FLOAT | % of most common value | Metric | `65.0` | +| `has_outliers` | BOOLEAN | Outliers detected? 
| Dimension | `true`, `false` | +| `outlier_count` | INTEGER | Number of outliers | Metric | `23` | +| `outlier_percentage` | FLOAT | % outliers | Metric | `0.23` | +| `quality_score` | FLOAT | Overall quality score (0-100) | Metric | `92.5` | +| `quality_issues` | STRING (JSON) | Detected issues | Metadata | `["high_nulls", "duplicate_values"]` | +| `validation_status` | STRING | Quality check result | Dimension | `pass`, `warn`, `fail` | + +### Partitioning & Clustering + +```sql +CREATE TABLE `project.dataset.data_profile_summary` +( + -- columns as above +) +PARTITION BY profiled_date +CLUSTER BY dataset_name, validation_status +OPTIONS( + description="Dataset profiling for data quality monitoring", + require_partition_filter=true, + partition_expiration_days=90 -- 3 months retention +); +``` + +### Primary Dimensions for Looker + +- **Dataset**: `dataset_name` +- **Column**: `column_name`, `data_type`, `inferred_type` +- **Quality**: `validation_status`, `quality_score` buckets +- **Time**: `profiled_date` + +### Sample Looker View + +```lookml +view: data_profile_summary { + sql_table_name: `project.dataset.data_profile_summary` ;; + + dimension: compound_key { + primary_key: yes + hidden: yes + sql: CONCAT(${TABLE}.profile_id, '|', ${TABLE}.column_name) ;; + } + + dimension: column_name { + type: string + sql: ${TABLE}.column_name ;; + } + + dimension: quality_tier { + type: string + sql: CASE + WHEN ${TABLE}.quality_score >= 90 THEN 'Excellent' + WHEN ${TABLE}.quality_score >= 75 THEN 'Good' + WHEN ${TABLE}.quality_score >= 60 THEN 'Fair' + ELSE 'Poor' + END ;; + } + + dimension: has_quality_issues { + type: yesno + sql: ${TABLE}.validation_status IN ('warn', 'fail') ;; + } + + measure: count_columns { + type: count_distinct + sql: ${TABLE}.column_name ;; + } + + measure: avg_quality_score { + type: average + sql: ${TABLE}.quality_score ;; + value_format_name: decimal_1 + } + + measure: avg_null_percentage { + type: average + sql: ${TABLE}.null_percentage ;; + value_format_name: percent_1 + } + + measure: columns_with_issues { + type: count_distinct + sql: ${TABLE}.column_name ;; + filters: [has_quality_issues: "yes"] + } +} +``` + +--- + +## 🔄 Schema Evolution Guidelines + +### ✅ **SAFE Changes** (Non-Breaking) + +1. **Add new columns** (always nullable or with defaults) + ```sql + ALTER TABLE `project.dataset.model_metrics` + ADD COLUMN IF NOT EXISTS new_metric FLOAT64; + ``` + +2. **Add new tables** (doesn't affect existing dashboards) + +3. **Lengthen STRING columns** (VARCHAR(50) → VARCHAR(100)) + +4. **Add indexes/clustering** (performance only) + +5. **Add column descriptions** + ```sql + ALTER TABLE `project.dataset.model_metrics` + ALTER COLUMN accuracy SET OPTIONS (description='Model accuracy (0-1)'); + ``` + +### ❌ **BREAKING Changes** (Require Dashboard Updates) + +1. **Rename columns** → Use views for backward compatibility: + ```sql + CREATE OR REPLACE VIEW `project.dataset.model_metrics_v2` AS + SELECT + model_id, + accuracy AS acc, -- renamed column + ... + FROM `project.dataset.model_metrics`; + ``` + +2. **Change data types** → Create new column, migrate, deprecate old: + ```sql + -- Step 1: Add new column + ALTER TABLE model_metrics ADD COLUMN created_at_new TIMESTAMP; + + -- Step 2: Backfill + UPDATE model_metrics SET created_at_new = CAST(created_at AS TIMESTAMP) WHERE true; + + -- Step 3: Update dashboards to use new column + + -- Step 4: Drop old column after validation period + ALTER TABLE model_metrics DROP COLUMN created_at; + ``` + +3. 
**Remove columns** → Deprecate first, remove after 90 days + +4. **Change partitioning** → Requires table recreation + +### 🔄 **Versioning Strategy** + +For major schema changes, create versioned tables: + +``` +project.dataset.model_metrics_v1 (deprecated, keep 90 days) +project.dataset.model_metrics_v2 (current) +project.dataset.model_metrics (view pointing to latest version) +``` + +--- + +## 📊 Dashboard-Ready Metrics Catalog + +### Model Performance Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Model Count** | `COUNT(DISTINCT model_id)` | Total models trained | +| **Avg Accuracy** | `AVG(accuracy)` | Overall model quality | +| **Accuracy Trend** | `AVG(accuracy) OVER (ORDER BY created_date)` | Performance over time | +| **Best Model** | `model_id WHERE accuracy = MAX(accuracy)` | Top performer | +| **Models by Type** | `COUNT(*) GROUP BY model_type` | Algorithm distribution | +| **Training Time** | `AVG(training_duration_seconds)` | Resource usage | +| **Recent Models** | `WHERE created_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)` | Latest activity | + +### Feature Importance Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Top Features** | `WHERE importance_rank <= 10` | Most impactful features | +| **Avg Importance** | `AVG(importance_score)` | Feature impact distribution | +| **Engineered Features** | `COUNT(*) WHERE is_engineered = true` | Feature engineering effectiveness | +| **Feature Stability** | `STDDEV(importance_score) GROUP BY feature_name` | Consistent predictors | + +### Prediction Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Accuracy Rate** | `AVG(CAST(is_correct AS FLOAT64))` | Real-world performance | +| **MAE** | `AVG(absolute_error)` | Average error magnitude | +| **RMSE** | `SQRT(AVG(squared_error))` | Error with outlier penalty | +| **Predictions/Day** | `COUNT(*) GROUP BY predicted_date` | Volume tracking | +| **Confidence Distribution** | `APPROX_QUANTILES(prediction_confidence, 10)` | Model calibration | +| **Segment Performance** | `AVG(is_correct) GROUP BY segment` | Fairness check | + +### Data Quality Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Data Quality Score** | `AVG(quality_score)` | Overall health | +| **Null Rate** | `AVG(null_percentage)` | Completeness | +| **Columns with Issues** | `COUNT(DISTINCT column_name) WHERE validation_status != 'pass'` | Problem areas | +| **Quality Trend** | `AVG(quality_score) OVER (ORDER BY profiled_date)` | Improving/degrading? 
| + +--- + +## 🎯 Sample Looker Explores + +### Explore 1: Model Performance Analysis + +```lookml +explore: model_metrics { + label: "Model Performance" + description: "Track model accuracy, training time, and comparison" + + join: feature_importance { + type: left_outer + sql_on: ${model_metrics.model_id} = ${feature_importance.model_id} ;; + relationship: one_to_many + } +} +``` + +### Explore 2: Prediction Monitoring + +```lookml +explore: predictions { + label: "Prediction Monitoring" + description: "Real-time prediction accuracy and drift" + + join: model_metrics { + type: left_outer + sql_on: ${predictions.model_id} = ${model_metrics.model_id} ;; + relationship: many_to_one + } +} +``` + +### Explore 3: Data Quality Dashboard + +```lookml +explore: data_profile_summary { + label: "Data Quality" + description: "Monitor data health and schema drift" +} +``` + +--- + +## 📝 Implementation Checklist + +### Phase 1: Setup (Week 1) +- [ ] Create all 4 BigQuery tables with partitioning +- [ ] Set up service account permissions +- [ ] Configure table expiration policies +- [ ] Document table owners and update SLAs + +### Phase 2: Integration (Week 2) +- [ ] Update tools to write to these schemas +- [ ] Add schema validation in CI/CD +- [ ] Create data dictionary in Looker +- [ ] Set up table monitoring alerts + +### Phase 3: BI Layer (Week 3) +- [ ] Create Looker views for all 4 tables +- [ ] Build explores with joins +- [ ] Create initial dashboards +- [ ] Set up scheduled data refreshes + +### Phase 4: Validation (Week 4) +- [ ] Backfill historical data +- [ ] Verify dashboard accuracy +- [ ] Train stakeholders on dashboards +- [ ] Document runbooks for common issues + +--- + +## 🔗 Related Tools + +**BigQuery Write Tools** (src/bigquery/): +- `bigquery_write_results()` - Generic write function +- Helper: `bigquery_write_model_metrics()` - Specialized writer +- Helper: `bigquery_write_feature_importance()` - Specialized writer +- Helper: `bigquery_write_predictions()` - Specialized writer +- Helper: `bigquery_write_data_profile()` - Specialized writer + +**Example Usage**: +```python +from src.bigquery import bigquery_write_results + +# Write model metrics +bigquery_write_results( + data=metrics_df, + table_id="project.dataset.model_metrics", + write_disposition="WRITE_APPEND" +) +``` + +--- + +## 📚 Additional Resources + +- [BigQuery Best Practices](https://cloud.google.com/bigquery/docs/best-practices) +- [Looker LookML Reference](https://cloud.google.com/looker/docs/reference/lookml-quick-reference) +- [Schema Design for BI](https://cloud.google.com/architecture/bigquery-data-warehouse) + +--- + +**Last Updated**: December 23, 2025 +**Schema Version**: 1.0.0 +**Maintained By**: Data Science Team +**Review Cadence**: Quarterly diff --git a/CHECKLIST.md b/CHECKLIST.md new file mode 100644 index 0000000000000000000000000000000000000000..2d25ef7533123646a384d6b5740c015dc0839ba7 --- /dev/null +++ b/CHECKLIST.md @@ -0,0 +1,97 @@ +# ✅ Pre-Launch Checklist + +## Before Running the Application + +### 1. Environment Variables ⚠️ **REQUIRED** + +You MUST set your API key before starting: + +```powershell +# Windows PowerShell +$env:GOOGLE_API_KEY="your-google-api-key-here" + +# Verify it's set +echo $env:GOOGLE_API_KEY +``` + +### 2. Build Status ✅ + +- [x] Frontend dependencies installed +- [x] Frontend built (FRRONTEEEND/dist exists) +- [x] Backend code updated with new endpoints +- [x] Configuration files in place + +### 3. 
Quick Start Commands + +**Option A - Use the start script:** +```powershell +.\start.ps1 +``` + +**Option B - Manual start:** +```powershell +# Make sure you're in the project root +Set-Location "c:\Users\Pulastya\Videos\DS AGENTTTT" + +# Set API key (if not already set) +$env:GOOGLE_API_KEY="your-key-here" + +# Start the server +python src\api\app.py +``` + +### 4. Access the Application + +Once the server starts, open your browser to: +**http://localhost:8080** + +You should see: +1. **Landing Page** - Professional homepage with agent features +2. **Launch Console** button - Click to open the chat interface +3. **Chat Interface** - Modern conversational UI + +### 5. Test the Chat + +Try these sample prompts: +- "What can you do?" +- "Explain your data science capabilities" +- "How do I upload a dataset?" +- "What ML models do you support?" + +### 6. Expected Console Output + +When you start the server, you should see: +``` +INFO: Started server process [####] +INFO: Waiting for application startup. +✅ Agent initialized with provider: groq +✅ Frontend assets mounted from C:\Users\Pulastya\Videos\DS AGENTTTT\FRRONTEEEND\dist +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:8080 +``` + +### 7. Troubleshooting Quick Reference + +| Issue | Solution | +|-------|----------| +| "Agent not initialized" | Set GOOGLE_API_KEY environment variable | +| "Frontend not found" | Run `cd FRRONTEEEND && npm run build` | +| Port 8080 in use | Kill the process or change PORT env var | +| Import errors | Run `pip install -r requirements.txt` | + +## Next Steps After Launch + +1. **Test the chat** with the agent +2. **Upload a dataset** (feature coming soon in chat) +3. **Try the API endpoints** at http://localhost:8080/docs +4. **Customize the frontend** in FRRONTEEEND/components/ + +## Documentation + +- 📖 [MIGRATION_COMPLETE.md](MIGRATION_COMPLETE.md) - What was changed +- 📖 [FRONTEND_INTEGRATION.md](FRONTEND_INTEGRATION.md) - Technical details +- 📖 [README.md](README.md) - Main project docs + +--- + +**Ready to launch?** Run `.\start.ps1` and visit http://localhost:8080 🚀 diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000000000000000000000000000000000000..730a645bc23f32c2e8f96839628cd63a74899218 --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,495 @@ +# 🚀 Google Cloud Run Deployment Guide + +Complete guide to deploy the Data Science Agent to Google Cloud Run as a serverless API. + +## 📋 Prerequisites + +1. **Google Cloud Platform Account** + - Active GCP account with billing enabled + - Project created (or use existing project) + +2. **Install Google Cloud SDK** + ```bash + # macOS (Homebrew) + brew install --cask google-cloud-sdk + + # Or download from: https://cloud.google.com/sdk/install + ``` + +3. **Authenticate with GCP** + ```bash + gcloud auth login + gcloud auth application-default login + ``` + +4. 
**Set Your Project** + ```bash + gcloud config set project YOUR_PROJECT_ID + ``` + +--- + +## 🎯 Deployment Options + +### Option 1: Automated Deployment (Recommended) + +Use the provided deployment script for one-command deployment: + +```bash +# Set required environment variables +export GCP_PROJECT_ID="your-project-id" +export GROQ_API_KEY="your-groq-api-key" +export GOOGLE_API_KEY="your-google-api-key" # Optional for Gemini + +# Run deployment script +./deploy.sh +``` + +**What it does:** +- ✅ Enables required GCP APIs (Cloud Build, Cloud Run, Secret Manager) +- ✅ Creates secrets for API keys +- ✅ Builds Docker container +- ✅ Deploys to Cloud Run +- ✅ Returns service URL + +**Configuration options:** +```bash +# Optional: Customize deployment +export CLOUD_RUN_REGION="us-central1" # Change region +export MEMORY="4Gi" # Increase memory +export CPU="2" # Set CPU count +export MAX_INSTANCES="10" # Scale limit +export TIMEOUT="900" # Request timeout (15 min) + +./deploy.sh +``` + +--- + +### Option 2: Manual Deployment + +Step-by-step manual deployment for full control: + +#### Step 1: Enable APIs +```bash +gcloud services enable \ + cloudbuild.googleapis.com \ + run.googleapis.com \ + containerregistry.googleapis.com \ + secretmanager.googleapis.com +``` + +#### Step 2: Create Secrets +```bash +# Create GROQ API key secret +echo -n "your-groq-api-key" | gcloud secrets create GROQ_API_KEY --data-file=- + +# Create Google API key secret (optional) +echo -n "your-google-api-key" | gcloud secrets create GOOGLE_API_KEY --data-file=- + +# Grant Cloud Run access to secrets +PROJECT_NUMBER=$(gcloud projects describe $(gcloud config get-value project) --format="value(projectNumber)") +gcloud secrets add-iam-policy-binding GROQ_API_KEY \ + --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \ + --role="roles/secretmanager.secretAccessor" +``` + +#### Step 3: Build Container +```bash +gcloud builds submit --tag gcr.io/$(gcloud config get-value project)/data-science-agent +``` + +#### Step 4: Deploy to Cloud Run +```bash +gcloud run deploy data-science-agent \ + --image gcr.io/$(gcloud config get-value project)/data-science-agent \ + --platform managed \ + --region us-central1 \ + --allow-unauthenticated \ + --memory 4Gi \ + --cpu 2 \ + --timeout 900 \ + --max-instances 10 \ + --set-env-vars LLM_PROVIDER=groq,REASONING_EFFORT=medium \ + --set-secrets GROQ_API_KEY=GROQ_API_KEY:latest,GOOGLE_API_KEY=GOOGLE_API_KEY:latest +``` + +--- + +### Option 3: CI/CD with Cloud Build Triggers + +Automated deployment on git push: + +#### Step 1: Connect Repository +```bash +# Connect GitHub/GitLab/Bitbucket repository +gcloud beta builds connections create github connection-name \ + --region=us-central1 +``` + +#### Step 2: Create Build Trigger +```bash +gcloud builds triggers create github \ + --name="deploy-data-science-agent" \ + --repo-name="Data-Science-Agent" \ + --repo-owner="Surfing-Ninja" \ + --branch-pattern="^main$" \ + --build-config="cloudbuild.yaml" +``` + +Now every push to `main` branch automatically deploys! 🎉 + +--- + +## 🧪 Testing the Deployment + +### 1. Health Check +```bash +SERVICE_URL=$(gcloud run services describe data-science-agent \ + --region us-central1 \ + --format 'value(status.url)') + +curl $SERVICE_URL/health +``` + +**Expected response:** +```json +{ + "status": "healthy", + "agent_ready": true, + "provider": "groq", + "tools_count": 82 +} +``` + +### 2. List Available Tools +```bash +curl $SERVICE_URL/tools | jq +``` + +### 3. 
Profile a Dataset +```bash +curl -X POST $SERVICE_URL/profile \ + -F "file=@test_data/sample.csv" +``` + +### 4. Run Full Analysis +```bash +curl -X POST $SERVICE_URL/run \ + -F "file=@test_data/sample.csv" \ + -F "task_description=Analyze this dataset, detect outliers, and train a prediction model" \ + -F "target_col=target" \ + | jq +``` + +--- + +## 📊 Monitoring & Logs + +### View Real-time Logs +```bash +gcloud run logs tail data-science-agent --region us-central1 +``` + +### View Recent Logs +```bash +gcloud run logs read data-science-agent \ + --region us-central1 \ + --limit 50 +``` + +### Cloud Console Monitoring +- Go to: https://console.cloud.google.com/run +- Click on `data-science-agent` +- View: Metrics, Logs, Revisions + +--- + +## 💰 Cost Estimation + +### Cloud Run Pricing (as of Dec 2024) +**Free Tier** (per month): +- 2 million requests +- 360,000 GB-seconds of memory +- 180,000 vCPU-seconds + +**Paid Tier** (us-central1): +- CPU: $0.00002400 per vCPU-second +- Memory: $0.00000250 per GB-second +- Requests: $0.40 per million requests + +**Example Cost for 4Gi Memory, 2 vCPU:** +- 1 request taking 60 seconds + - CPU: 2 vCPU × 60s × $0.000024 = $0.00288 + - Memory: 4GB × 60s × $0.0000025 = $0.0006 + - Request: $0.0000004 + - **Total: ~$0.0035 per request** + +**Monthly estimate for 1000 requests/month:** +- ~$3.50/month (well within free tier for testing!) + +--- + +## 🔒 Security Best Practices + +### 1. Enable Authentication (Production) +```bash +# Deploy with authentication required +gcloud run deploy data-science-agent \ + --no-allow-unauthenticated \ + --region us-central1 \ + --image gcr.io/PROJECT_ID/data-science-agent + +# Create service account for clients +gcloud iam service-accounts create api-client + +# Grant invoker role +gcloud run services add-iam-policy-binding data-science-agent \ + --member="serviceAccount:api-client@PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/run.invoker" \ + --region us-central1 +``` + +### 2. Use VPC Connector (For BigQuery/GCS) +```bash +# Create VPC connector +gcloud compute networks vpc-access connectors create ds-agent-connector \ + --network default \ + --region us-central1 \ + --range 10.8.0.0/28 + +# Deploy with VPC +gcloud run deploy data-science-agent \ + --vpc-connector ds-agent-connector \ + --region us-central1 +``` + +### 3. 
Restrict API Keys +- Set **Application restrictions** in Google Cloud Console +- Whitelist only Cloud Run service URL +- Set **API restrictions** to only required APIs + +--- + +## 🔧 Configuration Options + +### Environment Variables +```bash +# Set during deployment +--set-env-vars KEY1=value1,KEY2=value2 + +# Available variables: +LLM_PROVIDER=groq # or "gemini" +REASONING_EFFORT=medium # low, medium, high +CACHE_TTL_SECONDS=86400 # Cache lifetime +ARTIFACT_BACKEND=local # or "gcs" for cloud storage +GCS_BUCKET_NAME=your-bucket # If using GCS backend +OUTPUT_DIR=/tmp/outputs # Output directory +MAX_PARALLEL_TOOLS=5 # Concurrent tool execution +MAX_RETRIES=3 # Tool retry attempts +TIMEOUT_SECONDS=300 # Tool timeout +``` + +### Resource Limits +```bash +--memory 4Gi # 128Mi to 32Gi +--cpu 2 # 1 to 8 vCPU +--timeout 900 # Max 3600s (1 hour) +--max-instances 10 # Scale limit +--min-instances 0 # Always-warm instances +--concurrency 10 # Requests per instance +``` + +--- + +## 🐛 Troubleshooting + +### Build Fails +```bash +# Check build logs +gcloud builds list --limit=5 +gcloud builds log BUILD_ID + +# Common fixes: +# - Ensure Dockerfile is in root directory +# - Check requirements.txt has all dependencies +# - Increase build timeout: --timeout=1200s +``` + +### Deployment Fails +```bash +# Check service status +gcloud run services describe data-science-agent --region us-central1 + +# Common fixes: +# - Ensure APIs are enabled +# - Check secrets exist and are accessible +# - Verify service account permissions +``` + +### Runtime Errors +```bash +# View logs +gcloud run logs tail data-science-agent --region us-central1 + +# Common issues: +# - API keys not set: Check secrets +# - Import errors: Ensure all dependencies in requirements.txt +# - Memory issues: Increase --memory limit +# - Timeout: Increase --timeout value +``` + +### Container Crashes +```bash +# Test locally first +docker build -t ds-agent . +docker run -p 8080:8080 \ + -e GROQ_API_KEY="your-key" \ + ds-agent + +curl http://localhost:8080/health +``` + +--- + +## 🚀 Advanced Features + +### Custom Domain +```bash +# Map custom domain +gcloud run domain-mappings create \ + --service data-science-agent \ + --domain api.yourdomain.com \ + --region us-central1 +``` + +### Load Balancing +```bash +# Create multiple regional deployments +for region in us-central1 us-east1 europe-west1; do + gcloud run deploy data-science-agent \ + --image gcr.io/PROJECT_ID/data-science-agent \ + --region $region +done + +# Set up global load balancer +# Follow: https://cloud.google.com/load-balancing/docs/https/setup-global-ext-https-serverless +``` + +### Multi-Region Deployment +```bash +# Deploy to multiple regions for high availability +./deploy.sh CLOUD_RUN_REGION=us-central1 +./deploy.sh CLOUD_RUN_REGION=europe-west1 +./deploy.sh CLOUD_RUN_REGION=asia-east1 +``` + +--- + +## 📝 API Documentation + +Once deployed, access Swagger docs at: +``` +https://YOUR_SERVICE_URL/docs +``` + +### Available Endpoints + +#### `GET /` - Health Check +Returns service status and tool count. + +#### `GET /health` - Detailed Health +Returns agent readiness and provider info. + +#### `GET /tools` - List Tools +Returns all 82 available tools organized by category. + +#### `POST /run` - Run Full Analysis +Upload dataset and execute complete data science workflow. 
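+For programmatic access (instead of the `curl` examples above), here is a minimal Python sketch using `requests`. The service URL is a placeholder and the form fields mirror the parameters listed below.
+
+```python
+# Minimal sketch (assumptions: requests installed, SERVICE_URL points at the deployed service).
+import requests
+
+SERVICE_URL = "https://YOUR_SERVICE_URL"
+
+with open("test_data/sample.csv", "rb") as f:
+    resp = requests.post(
+        f"{SERVICE_URL}/run",
+        files={"file": ("sample.csv", f, "text/csv")},
+        data={
+            "task_description": "Profile this dataset and train a churn model",
+            "target_col": "churn",
+            "use_cache": "true",
+            "max_iterations": "20",
+        },
+        timeout=900,  # long-running workflow; matches the Cloud Run request timeout
+    )
+resp.raise_for_status()
+print(resp.json())
+```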
+ +**Parameters:** +- `file`: CSV/Parquet file (multipart/form-data) +- `task_description`: Natural language task description +- `target_col`: Target column for ML (optional) +- `use_cache`: Enable caching (default: true) +- `max_iterations`: Max workflow steps (default: 20) + +#### `POST /profile` - Quick Profile +Quick dataset profiling without full workflow. + +**Parameters:** +- `file`: CSV/Parquet file (multipart/form-data) + +--- + +## 🔄 Updates & Rollbacks + +### Update Deployment +```bash +# Rebuild and redeploy +./deploy.sh +``` + +### Rollback to Previous Revision +```bash +# List revisions +gcloud run revisions list --service data-science-agent --region us-central1 + +# Rollback +gcloud run services update-traffic data-science-agent \ + --to-revisions REVISION_NAME=100 \ + --region us-central1 +``` + +### Blue/Green Deployment +```bash +# Deploy new version with tag +gcloud run deploy data-science-agent \ + --tag blue \ + --no-traffic \ + --region us-central1 + +# Test: https://blue---data-science-agent-HASH.run.app + +# Switch traffic +gcloud run services update-traffic data-science-agent \ + --to-tags blue=100 \ + --region us-central1 +``` + +--- + +## 📚 Additional Resources + +- **Cloud Run Docs**: https://cloud.google.com/run/docs +- **Pricing Calculator**: https://cloud.google.com/products/calculator +- **Best Practices**: https://cloud.google.com/run/docs/tips +- **Quotas & Limits**: https://cloud.google.com/run/quotas + +--- + +## ✅ Deployment Checklist + +- [ ] GCP project created and billing enabled +- [ ] Google Cloud SDK installed and authenticated +- [ ] API keys obtained (GROQ_API_KEY, GOOGLE_API_KEY) +- [ ] Secrets created in Secret Manager +- [ ] Docker container builds successfully locally +- [ ] Cloud Run APIs enabled +- [ ] Service deployed to Cloud Run +- [ ] Health check endpoint returns 200 +- [ ] Test dataset profiled successfully +- [ ] Full analysis workflow tested +- [ ] Monitoring/logging configured +- [ ] Cost alerts set up (optional) +- [ ] Custom domain mapped (optional) +- [ ] CI/CD pipeline configured (optional) + +--- + +**Need help?** Check the troubleshooting section or view logs with: +```bash +gcloud run logs tail data-science-agent --region us-central1 +``` + +Happy deploying! 🎉 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..3efba6cebd27487a0fb3dd7c653390a633e6ecc0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,78 @@ +# Multi-stage build for Google Cloud Run +# Stage 1: Build Frontend +FROM node:20-alpine as frontend-builder + +WORKDIR /frontend + +# Copy frontend files +COPY FRRONTEEEND/package*.json ./ +RUN npm install + +COPY FRRONTEEEND/ ./ +RUN npm run build + +# Stage 2: Build Python environment +FROM python:3.13-slim as builder + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + make \ + && rm -rf /var/lib/apt/lists/* + +# Create virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Copy requirements and install Python packages +COPY requirements.txt . 
+RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r requirements.txt + +# Stage 3: Runtime environment +FROM python:3.13-slim + +# Install runtime dependencies only +RUN apt-get update && apt-get install -y \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# Copy virtual environment from builder +COPY --from=builder /opt/venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Set working directory +WORKDIR /app + +# Copy application code +COPY src/ /app/src/ +COPY examples/ /app/examples/ + +# Copy built frontend from frontend-builder +COPY --from=frontend-builder /frontend/dist /app/FRRONTEEEND/dist + +# Create necessary directories for Cloud Run ephemeral storage +RUN mkdir -p /tmp/data_science_agent \ + /tmp/outputs/models \ + /tmp/outputs/plots \ + /tmp/outputs/reports \ + /tmp/outputs/data \ + /tmp/cache_db + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PORT=8080 +ENV OUTPUT_DIR=/tmp/outputs +ENV CACHE_DB_PATH=/tmp/cache_db/cache.db +ENV ARTIFACT_BACKEND=local + +# Cloud Run expects the service to listen on the PORT env variable +EXPOSE 8080 + +# Health check (optional, Cloud Run handles this) +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import requests; requests.get('http://localhost:8080/health')" || exit 1 + +# Run the FastAPI application +CMD ["python", "src/api/app.py"] diff --git a/FRONTEND_INTEGRATION.md b/FRONTEND_INTEGRATION.md new file mode 100644 index 0000000000000000000000000000000000000000..15c0063b94d5b6eefe01556751f79eef56784704 --- /dev/null +++ b/FRONTEND_INTEGRATION.md @@ -0,0 +1,234 @@ +# Data Science Agent - Frontend Integration Guide + +## 🎉 New React Frontend + +The application now features a modern, professional React frontend that replaces the old Gradio interface. + +### Features + +- **Beautiful Landing Page**: Showcases the agent's capabilities with modern design +- **Professional Chat Interface**: NextChat-style conversational UI +- **Direct Backend Integration**: Communicates with your FastAPI backend +- **Responsive Design**: Works on all devices +- **Dark Theme**: Modern, eye-friendly interface + +## 🚀 Quick Start + +### Prerequisites + +- Python 3.13+ +- Node.js 20+ +- npm (comes with Node.js) + +### Running the Application + +#### Option 1: Using the Build Script (Recommended) + +**Windows:** +```powershell +.\build-and-deploy.ps1 +``` + +**Linux/Mac:** +```bash +chmod +x build-and-deploy.sh +./build-and-deploy.sh +``` + +Then start the server: +```bash +python src/api/app.py +``` + +#### Option 2: Manual Steps + +1. **Build the Frontend:** +```bash +cd FRRONTEEEND +npm.cmd install +npm.cmd run build +cd .. +``` + +2. **Install Python Dependencies:** +```bash +pip install -r requirements.txt +``` + +3. **Start the Backend Server:** +```bash +python src/api/app.py +``` + +4. **Access the Application:** +Open your browser and navigate to: http://localhost:8080 + +## 🏗️ Architecture + +### Backend (FastAPI) +- **Location**: `src/api/app.py` +- **Port**: 8080 +- **Endpoints**: + - `GET /` - Health check & landing page + - `POST /chat` - Chat interface endpoint + - `POST /run` - Full data science workflow + - `POST /profile` - Dataset profiling + - `GET /tools` - List available tools + +### Frontend (React + Vite) +- **Location**: `FRRONTEEEND/` +- **Build Output**: `FRRONTEEEND/dist/` +- **Dev Port**: 3000 (development mode) +- **Production**: Served by FastAPI at port 8080 + +## 🔧 Development Mode + +If you want to develop the frontend with hot-reloading: + +1. 
**Terminal 1 - Backend:** +```bash +python src/api/app.py +``` + +2. **Terminal 2 - Frontend:** +```bash +cd FRRONTEEEND +npm.cmd run dev +``` + +Access: +- Frontend (dev): http://localhost:3000 +- Backend API: http://localhost:8080 + +## 🌐 API Integration + +The frontend now communicates with your FastAPI backend instead of calling external APIs directly. + +### Environment Variables + +Create `FRRONTEEEND/.env` for local development: +```env +VITE_API_URL=http://localhost:8080 +``` + +For production, update `FRRONTEEEND/.env.production`: +```env +VITE_API_URL=https://your-cloud-run-url.run.app +``` + +## 📦 Deployment + +### Docker Build + +The Dockerfile now includes a multi-stage build that: +1. Builds the React frontend +2. Builds the Python environment +3. Combines both in the final image + +```bash +docker build -t data-science-agent . +docker run -p 8080:8080 data-science-agent +``` + +### Google Cloud Run + +```bash +gcloud builds submit --tag gcr.io/YOUR-PROJECT-ID/data-science-agent +gcloud run deploy data-science-agent \ + --image gcr.io/YOUR-PROJECT-ID/data-science-agent \ + --platform managed \ + --region us-central1 \ + --allow-unauthenticated \ + --set-env-vars GROQ_API_KEY=your-api-key +``` + +## 🔄 What Changed + +### Removed +- ❌ Gradio interface (`chat_ui.py` - kept for reference) +- ❌ Direct Google GenAI calls from frontend +- ❌ Gradio dependency + +### Added +- ✅ React + TypeScript frontend with Vite +- ✅ Professional landing page +- ✅ Modern chat interface +- ✅ `/chat` API endpoint +- ✅ CORS support in FastAPI +- ✅ Static file serving for React app +- ✅ Multi-stage Docker build + +## 🛠️ Tech Stack + +### Frontend +- React 19 +- TypeScript 5.8 +- Vite 6 +- Tailwind CSS +- Framer Motion (animations) +- Lucide React (icons) + +### Backend (unchanged) +- FastAPI +- Python 3.13 +- Groq API +- Polars, DuckDB +- Scikit-learn, XGBoost, LightGBM + +## 📁 Project Structure + +``` +. +├── FRRONTEEEND/ # React frontend +│ ├── components/ # React components +│ ├── dist/ # Built frontend (after npm run build) +│ ├── package.json +│ ├── vite.config.ts +│ └── .env # Frontend environment variables +├── src/ +│ ├── api/ +│ │ └── app.py # FastAPI backend (updated) +│ ├── tools/ # Data science tools +│ └── orchestrator.py # Main agent logic +├── requirements.txt # Python dependencies (updated) +├── Dockerfile # Multi-stage build (updated) +├── build-and-deploy.ps1 # Windows build script +└── build-and-deploy.sh # Linux/Mac build script +``` + +## 🐛 Troubleshooting + +### Frontend doesn't load +- Make sure you've run `npm run build` in the FRRONTEEEND directory +- Check that `FRRONTEEEND/dist/` exists and contains files + +### API errors in chat +- Ensure the backend is running on port 8080 +- Check that `GROQ_API_KEY` is set in your environment +- Verify the API URL in `.env` file + +### CORS errors +- The backend now has CORS enabled for development +- For production, update the `allow_origins` in `src/api/app.py` + +## 📝 Notes + +- The old `chat_ui.py` has been kept for reference but is no longer used +- All chat functionality now goes through the `/chat` endpoint +- The frontend is automatically served by FastAPI in production mode +- Session history is maintained in the frontend (browser) + +## 🎯 Next Steps + +1. **Customize the frontend**: Edit files in `FRRONTEEEND/components/` +2. **Add file upload**: Extend `ChatInterface.tsx` to handle file uploads +3. **Add visualization**: Display charts from the backend in the chat +4. 
**Authentication**: Add user authentication if needed + +## 📞 Support + +For issues or questions: +1. Check the console logs (browser & terminal) +2. Verify environment variables +3. Ensure all dependencies are installed +4. Review the API documentation at http://localhost:8080/docs diff --git a/FRRONTEEEND/.env.production b/FRRONTEEEND/.env.production new file mode 100644 index 0000000000000000000000000000000000000000..2cc3ea7198d28d128dee3896ee58004eeb3f053e --- /dev/null +++ b/FRRONTEEEND/.env.production @@ -0,0 +1,3 @@ +# Production API Configuration +# Update this to your production API URL +VITE_API_URL=https://your-cloud-run-url.run.app diff --git a/FRRONTEEEND/.gitignore b/FRRONTEEEND/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a547bf36d8d11a4f89c59c144f24795749086dd1 --- /dev/null +++ b/FRRONTEEEND/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/FRRONTEEEND/App.tsx b/FRRONTEEEND/App.tsx new file mode 100644 index 0000000000000000000000000000000000000000..52ef8907179ae2d06c533aca1dee7cc1a6c70508 --- /dev/null +++ b/FRRONTEEEND/App.tsx @@ -0,0 +1,59 @@ + +import React, { useState } from 'react'; +import { HeroGeometric } from './components/HeroGeometric'; +import ProblemSolution from './components/ProblemSolution'; +import KeyCapabilities from './components/KeyCapabilities'; +import Process from './components/Process'; +import TechStack from './components/TechStack'; +import Footer from './components/Footer'; +import { BackgroundPaths } from './components/BackgroundPaths'; +import { Logo } from './components/Logo'; +import { ChatInterface } from './components/ChatInterface'; + +const App: React.FC = () => { + const [view, setView] = useState<'landing' | 'chat'>('landing'); + + if (view === 'chat') { + return setView('landing')} />; + } + + return ( +
+ {/* Navigation (Overlay) */} + + +
+ setView('chat')} /> + + + + + {/* Transitional background paths section */} + + + +
+ +
+
+ ); +}; + +export default App; diff --git a/FRRONTEEEND/README.md b/FRRONTEEEND/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c6da4bbec3fc4886ec282b611d2c717fe6f602e --- /dev/null +++ b/FRRONTEEEND/README.md @@ -0,0 +1,20 @@ +
+GHBanner +
+ +# Run and deploy your AI Studio app + +This contains everything you need to run your app locally. + +View your app in AI Studio: https://ai.studio/apps/drive/1gChoktTuh429q26FzxS4BPo0q0LnlRE9 + +## Run Locally + +**Prerequisites:** Node.js + + +1. Install dependencies: + `npm install` +2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key +3. Run the app: + `npm run dev` diff --git a/FRRONTEEEND/components/BackgroundPaths.tsx b/FRRONTEEEND/components/BackgroundPaths.tsx new file mode 100644 index 0000000000000000000000000000000000000000..fcc5d3a741b76fac9313bac49cf130dd5f9abcb5 --- /dev/null +++ b/FRRONTEEEND/components/BackgroundPaths.tsx @@ -0,0 +1,148 @@ + +import React from "react"; +import { motion } from "framer-motion"; +import { ArrowRight } from "lucide-react"; +import { cn } from "../lib/utils"; + +function FloatingPaths({ position }: { position: number }) { + const paths = Array.from({ length: 36 }, (_, i) => ({ + id: i, + d: `M-${380 - i * 5 * position} -${189 + i * 6}C-${ + 380 - i * 5 * position + } -${189 + i * 6} -${312 - i * 5 * position} ${216 - i * 6} ${ + 152 - i * 5 * position + } ${343 - i * 6}C${616 - i * 5 * position} ${470 - i * 6} ${ + 684 - i * 5 * position + } ${875 - i * 6} ${684 - i * 5 * position} ${875 - i * 6}`, + color: `rgba(99,102,241,${0.05 + i * 0.01})`, // Using indigo-500 tint + width: 0.5 + i * 0.03, + })); + + return ( +
+ + Background Paths + {paths.map((path) => ( + + ))} + +
+ ); +} + +export function BackgroundPaths({ + title = "The Future is Autonomous", + subtitle = "Scale your data engineering and predictive modeling beyond human limits.", +}: { + title?: string; + subtitle?: string; +}) { + const words = title.split(" "); + + return ( +
+
+ + +
+ +
+ +

+ {words.map((word, wordIndex) => ( + + {word.split("").map((letter, letterIndex) => ( + + {letter} + + ))} + + ))} +

+ + + {subtitle} + + + + + +
+
+ + {/* Subtle glow effect at the bottom */} +
+
+ ); +} diff --git a/FRRONTEEEND/components/ChatInterface.tsx b/FRRONTEEEND/components/ChatInterface.tsx new file mode 100644 index 0000000000000000000000000000000000000000..f4a750af21511d594edc28a3d68e666d8d40a7d2 --- /dev/null +++ b/FRRONTEEEND/components/ChatInterface.tsx @@ -0,0 +1,571 @@ + +import React, { useState, useRef, useEffect } from 'react'; +import { motion, AnimatePresence } from 'framer-motion'; +import { Send, Plus, Search, Settings, MoreHorizontal, User, Bot, ArrowLeft, Paperclip, Sparkles, Trash2, X, Upload } from 'lucide-react'; +import { cn } from '../lib/utils'; +import { Logo } from './Logo'; +import ReactMarkdown from 'react-markdown'; + +interface Message { + id: string; + role: 'user' | 'assistant'; + content: string; + timestamp: Date; + file?: { + name: string; + size: number; + }; + reports?: Array<{ + name: string; + path: string; + }>; +} + +interface ChatSession { + id: string; + title: string; + messages: Message[]; + updatedAt: Date; +} + +export const ChatInterface: React.FC<{ onBack: () => void }> = ({ onBack }) => { + const [sessions, setSessions] = useState([ + { + id: '1', + title: 'ML Model Analysis', + messages: [], + updatedAt: new Date(), + } + ]); + const [activeSessionId, setActiveSessionId] = useState('1'); + const [input, setInput] = useState(''); + const [isTyping, setIsTyping] = useState(false); + const [uploadedFile, setUploadedFile] = useState(null); + const [reportModalUrl, setReportModalUrl] = useState(null); + const fileInputRef = useRef(null); + const scrollRef = useRef(null); + + const activeSession = sessions.find(s => s.id === activeSessionId) || sessions[0]; + + useEffect(() => { + if (scrollRef.current) { + scrollRef.current.scrollTop = scrollRef.current.scrollHeight; + } + }, [activeSession.messages, isTyping]); + + const handleSend = async () => { + if ((!input.trim() && !uploadedFile) || isTyping) return; + + const userMessage: Message = { + id: Date.now().toString(), + role: 'user', + content: input || (uploadedFile ? `Uploaded: ${uploadedFile.name}` : ''), + timestamp: new Date(), + file: uploadedFile ? 
{ name: uploadedFile.name, size: uploadedFile.size } : undefined, + }; + + const newMessages = [...activeSession.messages, userMessage]; + updateSession(activeSessionId, newMessages); + setInput(''); + setIsTyping(true); + + try { + // Use the current origin if running on same server, otherwise use env variable + const API_URL = window.location.origin; + console.log('API URL:', API_URL); + + let response; + + if (uploadedFile) { + const formData = new FormData(); + formData.append('file', uploadedFile); + formData.append('task_description', input || 'Analyze this dataset and provide insights'); + formData.append('use_cache', 'true'); + formData.append('max_iterations', '20'); + + response = await fetch(`${API_URL}/run`, { + method: 'POST', + body: formData + }); + + setUploadedFile(null); + } else { + response = await fetch(`${API_URL}/chat`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + messages: newMessages.map(m => ({ + role: m.role, + content: m.content + })), + stream: false + }) + }); + } + + if (!response.ok) { + throw new Error(`API error: ${response.status}`); + } + + const data = await response.json(); + + let assistantContent = ''; + let reports: Array<{name: string, path: string}> = []; + + if (uploadedFile && data.result) { + const result = data.result; + assistantContent = `✅ Analysis Complete!\n\n`; + + // Extract report paths from workflow history + if (result.workflow_history) { + const reportTools = ['generate_ydata_profiling_report', 'generate_sweetviz_report', 'generate_combined_eda_report']; + result.workflow_history.forEach((step: any) => { + if (reportTools.includes(step.tool)) { + // Check multiple possible locations for the report path + const reportPath = step.result?.output_path || step.result?.report_path || step.arguments?.output_path; + + if (reportPath && (step.result?.success !== false)) { + reports.push({ + name: step.tool.replace('generate_', '').replace(/_/g, ' ').replace('report', '').trim(), + path: reportPath + }); + } + } + }); + } + + // Also check for report paths mentioned in the summary text + if (result.summary && !reports.length) { + const reportPathMatch = result.summary.match(/\.(\/outputs\/reports\/[^\s]+\.html)/); + if (reportPathMatch) { + reports.push({ + name: 'ydata profiling', + path: reportPathMatch[1] + }); + } + } + + if (result.summary) { + assistantContent += `**Summary:**\n${result.summary}\n\n`; + } + + if (result.workflow_history && result.workflow_history.length > 0) { + assistantContent += `**Tools Used:** ${result.workflow_history.length} steps\n\n`; + assistantContent += `**Final Result:**\n${result.final_result || 'Analysis completed successfully'}`; + } + } else if (data.success && data.message) { + assistantContent = data.message; + } else { + throw new Error('Invalid response from API'); + } + + updateSession(activeSessionId, [...newMessages, { + id: (Date.now() + 1).toString(), + role: 'assistant', + content: assistantContent, + timestamp: new Date(), + reports: reports.length > 0 ? 
reports : undefined + }]); + } catch (error: any) { + console.error("Chat Error:", error); + + let errorMessage = "I'm sorry, I encountered an error processing your request."; + + if (error.message) { + errorMessage += `\n\n**Error:** ${error.message}`; + } + + // Try to parse response error + try { + const errorText = await error.text?.(); + if (errorText) { + const errorData = JSON.parse(errorText); + if (errorData.detail) { + errorMessage = `**Error:** ${typeof errorData.detail === 'string' ? errorData.detail : JSON.stringify(errorData.detail)}`; + } + } + } catch (e) { + // Ignore parsing errors + } + + updateSession(activeSessionId, [...newMessages, { + id: 'err-' + Date.now(), + role: 'assistant', + content: errorMessage, + timestamp: new Date() + }]); + } finally { + setIsTyping(false); + } + }; + + const updateSession = (id: string, messages: Message[]) => { + setSessions(prev => prev.map(s => { + if (s.id === id) { + return { ...s, messages, updatedAt: new Date() }; + } + return s; + })); + }; + + const createNewChat = () => { + const newId = Date.now().toString(); + const newSession: ChatSession = { + id: newId, + title: 'New Chat', + messages: [], + updatedAt: new Date() + }; + setSessions([newSession, ...sessions]); + setActiveSessionId(newId); + }; + + const deleteSession = (e: React.MouseEvent, id: string) => { + e.stopPropagation(); + if (sessions.length === 1) return; + setSessions(prev => prev.filter(s => s.id !== id)); + if (activeSessionId === id) { + setActiveSessionId(sessions.find(s => s.id !== id)?.id || ''); + } + }; + + const handleFileSelect = (e: React.ChangeEvent) => { + const file = e.target.files?.[0]; + if (file) { + const validTypes = ['.csv', '.parquet']; + const fileExt = file.name.substring(file.name.lastIndexOf('.')).toLowerCase(); + + if (validTypes.includes(fileExt)) { + setUploadedFile(file); + } else { + alert('Please upload a CSV or Parquet file'); + } + } + }; + + const removeFile = () => { + setUploadedFile(null); + if (fileInputRef.current) { + fileInputRef.current.value = ''; + } + }; + + return ( +
+ {/* Sidebar */} + + + {/* Main Chat Area */} +
+ {/* Top Header */} +
+
+ +
+

{activeSession.title}

+

{activeSession.messages.length} messages in session

+
+
+
+ + +
+
+ + {/* Message List */} +
+ {activeSession.messages.length === 0 ? ( +
+ + + +

Welcome, Data Scientist

+

+ I'm your autonomous agent ready to profile data, train models, or build dashboards. + Try uploading a dataset or describing your ML objective. +

+
+ {[ + "Profile my sales.csv", + "Train a XGBoost classifier", + "Generate a correlation heatmap", + "Explain feature importance" + ].map(prompt => ( + + ))} +
+
+ ) : ( + activeSession.messages.map((msg) => ( + +
+ {msg.role === 'user' ? : } +
+
+ {msg.file && ( +
+ + {msg.file.name} + ({(msg.file.size / 1024).toFixed(1)} KB) +
+ )} + {msg.role === 'assistant' ? ( +

, + ul: ({node, ...props}) =>

    , + ol: ({node, ...props}) =>
      , + li: ({node, ...props}) =>
    1. , + strong: ({node, ...props}) => , + code: ({node, inline, ...props}: any) => + inline ? + : + + }} + > + {msg.content || ''} + + ) : ( + msg.content || (msg.role === 'assistant' && isTyping && "...") + )} + {msg.reports && msg.reports.length > 0 && ( +
      + {msg.reports.map((report, idx) => ( + + ))} +
      + )} +
      + {msg.timestamp.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })} +
      +
+
+ )) + )} + {isTyping && activeSession.messages[activeSession.messages.length - 1]?.role === 'user' && ( +
+
+ +
+
+
+ + + +
+
+
+ )} +
+ + {/* Input Bar */} +
+
+
+ + + {uploadedFile && ( +
+ + {uploadedFile.name} + +
+ )} +
+
+