Pulastya B committed
Commit 226ac39 · 0 Parent(s):

feat: Initial commit - Data Science Agent with React frontend and FastAPI backend

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .dockerignore +74 -0
  2. .env.example +19 -0
  3. .gcloudignore +59 -0
  4. .gitignore +71 -0
  5. BIGQUERY_SCHEMAS.md +691 -0
  6. CHECKLIST.md +97 -0
  7. DEPLOYMENT.md +495 -0
  8. Dockerfile +78 -0
  9. FRONTEND_INTEGRATION.md +234 -0
  10. FRRONTEEEND/.env.production +3 -0
  11. FRRONTEEEND/.gitignore +24 -0
  12. FRRONTEEEND/App.tsx +59 -0
  13. FRRONTEEEND/README.md +20 -0
  14. FRRONTEEEND/components/BackgroundPaths.tsx +148 -0
  15. FRRONTEEEND/components/ChatInterface.tsx +571 -0
  16. FRRONTEEEND/components/Footer.tsx +171 -0
  17. FRRONTEEEND/components/HeroGeometric.tsx +213 -0
  18. FRRONTEEEND/components/KeyCapabilities.tsx +91 -0
  19. FRRONTEEEND/components/Logo.tsx +92 -0
  20. FRRONTEEEND/components/ProblemSolution.tsx +70 -0
  21. FRRONTEEEND/components/Process.tsx +70 -0
  22. FRRONTEEEND/components/ShadowSection.tsx +222 -0
  23. FRRONTEEEND/components/TechStack.tsx +36 -0
  24. FRRONTEEEND/index.html +59 -0
  25. FRRONTEEEND/index.tsx +16 -0
  26. FRRONTEEEND/lib/utils.ts +7 -0
  27. FRRONTEEEND/metadata.json +5 -0
  28. FRRONTEEEND/package-lock.json +0 -0
  29. FRRONTEEEND/package.json +26 -0
  30. FRRONTEEEND/tsconfig.json +29 -0
  31. FRRONTEEEND/vite.config.ts +29 -0
  32. GEMINI_UPDATE.md +93 -0
  33. MIGRATION_COMPLETE.md +325 -0
  34. QUICK_REFERENCE.txt +71 -0
  35. README.md +632 -0
  36. build-and-deploy.ps1 +39 -0
  37. build-and-deploy.sh +33 -0
  38. cache_db/.gitkeep +0 -0
  39. chat_ui.py +1073 -0
  40. cloudbuild.yaml +69 -0
  41. data/.gitkeep +0 -0
  42. deploy.sh +171 -0
  43. examples/titanic_example.py +166 -0
  44. requirements.txt +98 -0
  45. setup-deployment.sh +78 -0
  46. src/__init__.py +7 -0
  47. src/api/__init__.py +4 -0
  48. src/api/app.py +513 -0
  49. src/cache/__init__.py +5 -0
  50. src/cache/cache_manager.py +292 -0
.dockerignore ADDED
@@ -0,0 +1,74 @@
+ # Python cache and environment
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # Development files
+ .git/
+ .gitignore
+ .env
+ .env.local
+ *.log
+
+ # Output directories (not needed in container)
+ outputs/
+ cache_db/
+ temp/
+ test_data/
+ data/
+
+ # Frontend development files (will be built in Docker)
+ FRRONTEEEND/node_modules/
+ FRRONTEEEND/.env
+ FRRONTEEEND/.env.local
+
+ # Documentation and tests
+ *.md
+ !README.md
+ tests/
+ test_*.py
+ check_*.py
+
+ # Old Gradio UI (no longer used)
+ chat_ui.py
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter notebooks
+ *.ipynb
+ .ipynb_checkpoints/
+
+ # Large model files (if any)
+ *.pkl
+ *.joblib
+ *.h5
+ *.pt
+ *.pth
+
+ # Documentation
+ docs/
+ PHASE*.md
+ PROJECT*.md
+ TOKEN*.md
+ TOOL*.md
+ FEATURE*.md
+ IMPLEMENTATION*.md
+ MIGRATION*.md
+ EDA_REPORTS*.md
+ GITHUB*.md
+ BIGQUERY*.md
.env.example ADDED
@@ -0,0 +1,19 @@
+ # Google Gemini API Configuration
+ GOOGLE_API_KEY=your_google_api_key_here
+
+ # Model Configuration
+ LLM_PROVIDER=gemini
+ REASONING_EFFORT=medium
+
+ # Cache Configuration
+ CACHE_DB_PATH=./cache_db/cache.db
+ CACHE_TTL_SECONDS=86400
+
+ # Output Configuration
+ OUTPUT_DIR=./outputs
+ DATA_DIR=./data
+
+ # Performance Configuration
+ MAX_PARALLEL_TOOLS=5
+ MAX_RETRIES=3
+ TIMEOUT_SECONDS=300
.gcloudignore ADDED
@@ -0,0 +1,59 @@
+ # This file specifies files that are *not* uploaded to Google Cloud
+ # using gcloud. It follows the same syntax as .gitignore
+
+ .gcloudignore
+ .git
+ .gitignore
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # Local development
+ .env
+ .env.local
+ *.log
+
+ # Outputs and cache (regenerated in cloud)
+ outputs/
+ cache_db/
+ temp/
+ test_data/
+ data/
+
+ # Documentation
+ *.md
+ !README.md
+
+ # Tests
+ tests/
+ test_*.py
+ check_*.py
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter
+ *.ipynb
+ .ipynb_checkpoints/
+
+ # Build artifacts
+ *.pkl
+ *.joblib
+ *.h5
+ *.pt
+ *.pth
.gitignore ADDED
@@ -0,0 +1,71 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ .venv/
+ env/
+ ENV/
+
+ # Environment Variables
+ .env
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Cache & Outputs
+ cache_db/*.db
+ cache_db/*.db-journal
+ cache_db/
+ outputs/
+ temp/
+ *.pkl
+ *.joblib
+
+ # Data files (except examples)
+ data/*.csv
+ data/*.parquet
+ !data/.gitkeep
+
+ # Cloud Run URL
+ .cloud_run_url
+
+ # Jupyter
+ .ipynb_checkpoints/
+ *.ipynb
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Testing
+ .pytest_cache/
+ .coverage
+ htmlcov/
+ .tox/
+
+ # Logs
+ *.log
BIGQUERY_SCHEMAS.md ADDED
@@ -0,0 +1,691 @@
+ # BigQuery Output Schemas for Looker Compatibility
+
+ **Purpose**: Define stable BigQuery table schemas that BI tools (Looker, Data Studio) can query reliably.
+
+ **Design Principles**:
+ - ✅ **Stable Schema**: No breaking changes without versioning
+ - ✅ **Consistent Naming**: snake_case columns, clear dimension/metric separation
+ - ✅ **BI-Friendly Types**: Standard SQL types, no complex nested structures
+ - ✅ **Documented Grain**: Clear primary keys and update patterns
+ - ✅ **Dashboard-Ready**: Metrics aligned with common visualizations
+
+ ---
+
+ ## 📊 Table 1: `model_metrics`
+
+ **Description**: Model performance metrics tracked over time for monitoring and comparison.
+
+ **Use Cases**:
+ - Performance dashboards
+ - Model comparison reports
+ - Drift detection alerts
+ - A/B test analysis
+
+ **Update Frequency**: On every model training run
+
+ **Grain**: One row per model training execution
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `project_id` | STRING | Google Cloud project ID | Dimension | `my-ml-project` |
+ | `dataset_id` | STRING | BigQuery dataset name | Dimension | `ml_models` |
+ | `model_id` | STRING | Unique model identifier | Dimension (Primary Key) | `xgboost_churn_20251223_153045` |
+ | `model_name` | STRING | Human-readable model name | Dimension | `Customer Churn Predictor` |
+ | `model_type` | STRING | Algorithm used | Dimension | `XGBoost`, `RandomForest`, `LightGBM` |
+ | `task_type` | STRING | ML task category | Dimension | `classification`, `regression` |
+ | `training_dataset` | STRING | Source table/file reference | Dimension | `project.dataset.train_data` |
+ | `target_column` | STRING | Prediction target name | Dimension | `churn`, `price`, `survived` |
+ | `created_at` | TIMESTAMP | Model training timestamp | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `created_date` | DATE | Training date (for partitioning) | Dimension (Time) | `2025-12-23` |
+ | `feature_count` | INTEGER | Number of features used | Metric | `42` |
+ | `training_rows` | INTEGER | Training set size | Metric | `10000` |
+ | `test_rows` | INTEGER | Test set size | Metric | `2500` |
+ | `training_duration_seconds` | FLOAT | Time to train model | Metric | `123.45` |
+ | `accuracy` | FLOAT | Overall accuracy (0-1) | Metric | `0.95` |
+ | `precision` | FLOAT | Precision score (0-1) | Metric | `0.92` |
+ | `recall` | FLOAT | Recall score (0-1) | Metric | `0.88` |
+ | `f1_score` | FLOAT | F1 score (0-1) | Metric | `0.90` |
+ | `roc_auc` | FLOAT | ROC AUC score (0-1) | Metric | `0.94` |
+ | `pr_auc` | FLOAT | Precision-Recall AUC (0-1) | Metric | `0.91` |
+ | `mae` | FLOAT | Mean Absolute Error (regression) | Metric | `1234.56` |
+ | `mse` | FLOAT | Mean Squared Error (regression) | Metric | `567890.12` |
+ | `rmse` | FLOAT | Root Mean Squared Error (regression) | Metric | `753.59` |
+ | `r2_score` | FLOAT | R² coefficient (regression) | Metric | `0.85` |
+ | `cross_val_mean` | FLOAT | Mean CV score | Metric | `0.93` |
+ | `cross_val_std` | FLOAT | CV score std deviation | Metric | `0.02` |
+ | `hyperparameters` | STRING (JSON) | Model hyperparameters | Metadata | `{"max_depth": 6, "n_estimators": 100}` |
+ | `version` | STRING | Model version tag | Dimension | `v1.2.3` |
+ | `environment` | STRING | Training environment | Dimension | `production`, `staging`, `development` |
+ | `user_email` | STRING | User who trained model | Dimension | `data-scientist@company.com` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ -- Recommended table setup
+ CREATE TABLE `project.dataset.model_metrics`
+ (
+   -- columns as above
+ )
+ PARTITION BY created_date
+ CLUSTER BY model_type, task_type, environment
+ OPTIONS(
+   description="Model performance metrics for BI dashboards",
+   require_partition_filter=true
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Time**: `created_at`, `created_date`
+ - **Model**: `model_type`, `model_name`, `task_type`
+ - **Performance Tier**: CASE expression on `accuracy`/`f1_score`
+   - `Excellent` (>0.90)
+   - `Good` (0.80-0.90)
+   - `Fair` (0.70-0.80)
+   - `Poor` (<0.70)
+
+ ### Sample Looker View
+
+ ```lookml
+ view: model_metrics {
+   sql_table_name: `project.dataset.model_metrics` ;;
+
+   dimension: model_id {
+     primary_key: yes
+     type: string
+     sql: ${TABLE}.model_id ;;
+   }
+
+   dimension_group: created {
+     type: time
+     timeframes: [date, week, month, quarter, year]
+     sql: ${TABLE}.created_at ;;
+   }
+
+   dimension: model_type {
+     type: string
+     sql: ${TABLE}.model_type ;;
+   }
+
+   dimension: performance_tier {
+     type: string
+     sql: CASE
+            WHEN ${TABLE}.accuracy >= 0.90 THEN 'Excellent'
+            WHEN ${TABLE}.accuracy >= 0.80 THEN 'Good'
+            WHEN ${TABLE}.accuracy >= 0.70 THEN 'Fair'
+            ELSE 'Poor'
+          END ;;
+   }
+
+   measure: count {
+     type: count
+   }
+
+   measure: avg_accuracy {
+     type: average
+     sql: ${TABLE}.accuracy ;;
+     value_format_name: percent_2
+   }
+
+   measure: avg_f1_score {
+     type: average
+     sql: ${TABLE}.f1_score ;;
+     value_format_name: percent_2
+   }
+ }
+ ```
+
+ ---
+
+ ## 🎯 Table 2: `feature_importance`
+
+ **Description**: Feature importance scores for model interpretability.
+
+ **Use Cases**:
+ - Feature impact analysis
+ - Feature selection dashboards
+ - Model explainability reports
+
+ **Update Frequency**: On every model training run
+
+ **Grain**: One row per feature per model
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `model_id` | STRING | Foreign key to model_metrics | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` |
+ | `feature_name` | STRING | Name of the feature | Dimension (Primary Key) | `age`, `total_purchases`, `days_since_last_login` |
+ | `importance_score` | FLOAT | Importance value (0-1) | Metric | `0.35` |
+ | `importance_rank` | INTEGER | Rank by importance (1=most important) | Metric | `1`, `2`, `3` |
+ | `importance_type` | STRING | Calculation method | Dimension | `gain`, `weight`, `cover`, `shap` |
+ | `feature_type` | STRING | Data type category | Dimension | `numeric`, `categorical`, `datetime`, `text` |
+ | `is_engineered` | BOOLEAN | Created by feature engineering? | Dimension | `true`, `false` |
+ | `created_at` | TIMESTAMP | When importance was calculated | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `created_date` | DATE | Calculation date | Dimension (Time) | `2025-12-23` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ CREATE TABLE `project.dataset.feature_importance`
+ (
+   -- columns as above
+ )
+ PARTITION BY created_date
+ CLUSTER BY model_id, importance_rank
+ OPTIONS(
+   description="Feature importance scores for model explainability",
+   require_partition_filter=false  -- Allow cross-model queries
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Feature**: `feature_name`, `feature_type`, `is_engineered`
+ - **Model**: `model_id` (join to model_metrics)
+ - **Importance**: `importance_rank`, `importance_type`
+
+ ### Sample Looker View
+
+ ```lookml
+ view: feature_importance {
+   sql_table_name: `project.dataset.feature_importance` ;;
+
+   dimension: compound_key {
+     primary_key: yes
+     hidden: yes
+     sql: CONCAT(${TABLE}.model_id, '|', ${TABLE}.feature_name) ;;
+   }
+
+   dimension: feature_name {
+     type: string
+     sql: ${TABLE}.feature_name ;;
+   }
+
+   dimension: is_top_10 {
+     type: yesno
+     sql: ${TABLE}.importance_rank <= 10 ;;
+   }
+
+   measure: avg_importance {
+     type: average
+     sql: ${TABLE}.importance_score ;;
+     value_format_name: percent_2
+   }
+
+   measure: count_features {
+     type: count_distinct
+     sql: ${TABLE}.feature_name ;;
+   }
+ }
+ ```
+
+ ---
+
+ ## 🔮 Table 3: `predictions`
+
+ **Description**: Model predictions with actuals for monitoring and evaluation.
+
+ **Use Cases**:
+ - Prediction monitoring
+ - Accuracy tracking over time
+ - Segment performance analysis
+ - Business impact measurement
+
+ **Update Frequency**: Real-time or batch (daily/hourly)
+
+ **Grain**: One row per prediction
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `prediction_id` | STRING | Unique prediction identifier | Dimension (Primary Key) | `pred_abc123xyz` |
+ | `model_id` | STRING | Model used for prediction | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` |
+ | `entity_id` | STRING | Entity being predicted (customer_id, product_id, etc.) | Dimension | `customer_12345` |
+ | `predicted_at` | TIMESTAMP | When prediction was made | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `predicted_date` | DATE | Prediction date (for partitioning) | Dimension (Time) | `2025-12-23` |
+ | `prediction_value` | FLOAT | Predicted value | Metric | `0.85` (probability), `49.99` (price) |
+ | `prediction_class` | STRING | Predicted class (classification) | Dimension | `churn`, `not_churn` |
+ | `prediction_confidence` | FLOAT | Model confidence (0-1) | Metric | `0.92` |
+ | `actual_value` | FLOAT | True value (when available) | Metric | `1.0` (churned), `52.50` (actual price) |
+ | `actual_class` | STRING | True class (when available) | Dimension | `churn`, `not_churn` |
+ | `actual_recorded_at` | TIMESTAMP | When actual became known | Dimension (Time) | `2025-12-30 10:00:00 UTC` |
+ | `is_correct` | BOOLEAN | Prediction was correct? | Dimension | `true`, `false` |
+ | `absolute_error` | FLOAT | \|predicted - actual\| | Metric | `2.51` |
+ | `squared_error` | FLOAT | (predicted - actual)² | Metric | `6.30` |
+ | `feature_values` | STRING (JSON) | Input features used | Metadata | `{"age": 35, "tenure": 24}` |
+ | `segment` | STRING | Business segment | Dimension | `enterprise`, `smb`, `consumer` |
+ | `region` | STRING | Geographic region | Dimension | `us-west`, `eu-central` |
+ | `model_version` | STRING | Model version | Dimension | `v1.2.3` |
+ | `prediction_latency_ms` | FLOAT | Inference time | Metric | `23.4` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ CREATE TABLE `project.dataset.predictions`
+ (
+   -- columns as above
+ )
+ PARTITION BY predicted_date
+ CLUSTER BY model_id, segment, is_correct
+ OPTIONS(
+   description="Model predictions with actuals for monitoring",
+   require_partition_filter=true,
+   partition_expiration_days=730  -- 2 years retention
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Time**: `predicted_date`, days since prediction
+ - **Model**: `model_id`, `model_version`
+ - **Segment**: `segment`, `region`
+ - **Accuracy**: `is_correct`, error buckets
+
+ ### Sample Looker View
+
+ ```lookml
+ view: predictions {
+   sql_table_name: `project.dataset.predictions` ;;
+
+   dimension: prediction_id {
+     primary_key: yes
+     type: string
+     sql: ${TABLE}.prediction_id ;;
+   }
+
+   dimension_group: predicted {
+     type: time
+     timeframes: [date, week, month]
+     sql: ${TABLE}.predicted_at ;;
+   }
+
+   dimension: segment {
+     type: string
+     sql: ${TABLE}.segment ;;
+   }
+
+   dimension: error_bucket {
+     type: string
+     sql: CASE
+            WHEN ${TABLE}.absolute_error IS NULL THEN 'No Actual Yet'
+            WHEN ${TABLE}.absolute_error <= 0.1 THEN '0-10%'
+            WHEN ${TABLE}.absolute_error <= 0.2 THEN '10-20%'
+            ELSE '>20%'
+          END ;;
+   }
+
+   measure: count {
+     type: count
+   }
+
+   measure: accuracy_rate {
+     type: average
+     sql: CAST(${TABLE}.is_correct AS FLOAT64) ;;
+     value_format_name: percent_1
+   }
+
+   measure: avg_confidence {
+     type: average
+     sql: ${TABLE}.prediction_confidence ;;
+     value_format_name: percent_2
+   }
+
+   measure: mae {
+     type: average
+     sql: ${TABLE}.absolute_error ;;
+     value_format_name: decimal_2
+   }
+ }
+ ```
+
+ ---
+
+ ## 📋 Table 4: `data_profile_summary`
+
+ **Description**: Dataset profiling statistics for data quality monitoring.
+
+ **Use Cases**:
+ - Data quality dashboards
+ - Schema drift detection
+ - Data validation reports
+ - Column-level monitoring
+
+ **Update Frequency**: Daily or on-demand
+
+ **Grain**: One row per column per dataset per run
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `profile_id` | STRING | Unique profile run identifier | Dimension (Primary Key) | `profile_abc123xyz` |
+ | `dataset_name` | STRING | Source table/file name | Dimension | `project.dataset.customers` |
+ | `column_name` | STRING | Column being profiled | Dimension | `age`, `email`, `signup_date` |
+ | `profiled_at` | TIMESTAMP | When profiling ran | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `profiled_date` | DATE | Profiling date | Dimension (Time) | `2025-12-23` |
+ | `data_type` | STRING | Column data type | Dimension | `INTEGER`, `STRING`, `FLOAT`, `TIMESTAMP` |
+ | `inferred_type` | STRING | Smart type inference | Dimension | `numeric`, `categorical`, `datetime`, `text`, `email` |
+ | `row_count` | INTEGER | Total rows in dataset | Metric | `10000` |
+ | `non_null_count` | INTEGER | Non-null values | Metric | `9850` |
+ | `null_count` | INTEGER | Null values | Metric | `150` |
+ | `null_percentage` | FLOAT | % null (0-100) | Metric | `1.5` |
+ | `unique_count` | INTEGER | Distinct values | Metric | `450` |
+ | `uniqueness_percentage` | FLOAT | % unique (0-100) | Metric | `4.5` |
+ | `min_value` | STRING | Minimum value (as string) | Metadata | `18`, `2020-01-01` |
+ | `max_value` | STRING | Maximum value (as string) | Metadata | `95`, `2025-12-23` |
+ | `mean_value` | FLOAT | Mean (numeric only) | Metric | `42.5` |
+ | `median_value` | FLOAT | Median (numeric only) | Metric | `38.0` |
+ | `std_dev` | FLOAT | Standard deviation (numeric only) | Metric | `15.2` |
+ | `skewness` | FLOAT | Distribution skewness | Metric | `0.85` |
+ | `kurtosis` | FLOAT | Distribution kurtosis | Metric | `2.1` |
+ | `top_value` | STRING | Most common value | Metadata | `male`, `active` |
+ | `top_value_frequency` | INTEGER | Count of most common value | Metric | `6500` |
+ | `top_value_percentage` | FLOAT | % of most common value | Metric | `65.0` |
+ | `has_outliers` | BOOLEAN | Outliers detected? | Dimension | `true`, `false` |
+ | `outlier_count` | INTEGER | Number of outliers | Metric | `23` |
+ | `outlier_percentage` | FLOAT | % outliers | Metric | `0.23` |
+ | `quality_score` | FLOAT | Overall quality score (0-100) | Metric | `92.5` |
+ | `quality_issues` | STRING (JSON) | Detected issues | Metadata | `["high_nulls", "duplicate_values"]` |
+ | `validation_status` | STRING | Quality check result | Dimension | `pass`, `warn`, `fail` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ CREATE TABLE `project.dataset.data_profile_summary`
+ (
+   -- columns as above
+ )
+ PARTITION BY profiled_date
+ CLUSTER BY dataset_name, validation_status
+ OPTIONS(
+   description="Dataset profiling for data quality monitoring",
+   require_partition_filter=true,
+   partition_expiration_days=90  -- 3 months retention
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Dataset**: `dataset_name`
+ - **Column**: `column_name`, `data_type`, `inferred_type`
+ - **Quality**: `validation_status`, `quality_score` buckets
+ - **Time**: `profiled_date`
+
+ ### Sample Looker View
+
+ ```lookml
+ view: data_profile_summary {
+   sql_table_name: `project.dataset.data_profile_summary` ;;
+
+   dimension: compound_key {
+     primary_key: yes
+     hidden: yes
+     sql: CONCAT(${TABLE}.profile_id, '|', ${TABLE}.column_name) ;;
+   }
+
+   dimension: column_name {
+     type: string
+     sql: ${TABLE}.column_name ;;
+   }
+
+   dimension: quality_tier {
+     type: string
+     sql: CASE
+            WHEN ${TABLE}.quality_score >= 90 THEN 'Excellent'
+            WHEN ${TABLE}.quality_score >= 75 THEN 'Good'
+            WHEN ${TABLE}.quality_score >= 60 THEN 'Fair'
+            ELSE 'Poor'
+          END ;;
+   }
+
+   dimension: has_quality_issues {
+     type: yesno
+     sql: ${TABLE}.validation_status IN ('warn', 'fail') ;;
+   }
+
+   measure: count_columns {
+     type: count_distinct
+     sql: ${TABLE}.column_name ;;
+   }
+
+   measure: avg_quality_score {
+     type: average
+     sql: ${TABLE}.quality_score ;;
+     value_format_name: decimal_1
+   }
+
+   measure: avg_null_percentage {
+     type: average
+     sql: ${TABLE}.null_percentage ;;
+     value_format_name: percent_1
+   }
+
+   measure: columns_with_issues {
+     type: count_distinct
+     sql: ${TABLE}.column_name ;;
+     filters: [has_quality_issues: "yes"]
+   }
+ }
+ ```
+
+ ---
+
+ ## 🔄 Schema Evolution Guidelines
+
+ ### ✅ **SAFE Changes** (Non-Breaking)
+
+ 1. **Add new columns** (always nullable or with defaults)
+    ```sql
+    ALTER TABLE `project.dataset.model_metrics`
+    ADD COLUMN IF NOT EXISTS new_metric FLOAT64;
+    ```
+
+ 2. **Add new tables** (doesn't affect existing dashboards)
+
+ 3. **Lengthen STRING columns** (VARCHAR(50) → VARCHAR(100))
+
+ 4. **Add indexes/clustering** (performance only)
+
+ 5. **Add column descriptions**
+    ```sql
+    ALTER TABLE `project.dataset.model_metrics`
+    ALTER COLUMN accuracy SET OPTIONS (description='Model accuracy (0-1)');
+    ```
+
+ ### ❌ **BREAKING Changes** (Require Dashboard Updates)
+
+ 1. **Rename columns** → Use views for backward compatibility:
+    ```sql
+    CREATE OR REPLACE VIEW `project.dataset.model_metrics_v2` AS
+    SELECT
+      model_id,
+      accuracy AS acc,  -- renamed column
+      ...
+    FROM `project.dataset.model_metrics`;
+    ```
+
+ 2. **Change data types** → Create new column, migrate, deprecate old:
+    ```sql
+    -- Step 1: Add new column
+    ALTER TABLE model_metrics ADD COLUMN created_at_new TIMESTAMP;
+
+    -- Step 2: Backfill
+    UPDATE model_metrics SET created_at_new = CAST(created_at AS TIMESTAMP) WHERE true;
+
+    -- Step 3: Update dashboards to use new column
+
+    -- Step 4: Drop old column after validation period
+    ALTER TABLE model_metrics DROP COLUMN created_at;
+    ```
+
+ 3. **Remove columns** → Deprecate first, remove after 90 days
+
+ 4. **Change partitioning** → Requires table recreation
+
+ ### 🔄 **Versioning Strategy**
+
+ For major schema changes, create versioned tables:
+
+ ```
+ project.dataset.model_metrics_v1 (deprecated, keep 90 days)
+ project.dataset.model_metrics_v2 (current)
+ project.dataset.model_metrics (view pointing to latest version)
+ ```
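+
+ The stable view name is what keeps dashboards working across versions. As a minimal sketch (an assumed maintenance step, not a helper shipped in this repo), repointing the view with the Python BigQuery client looks like:
+
+ ```python
+ from google.cloud import bigquery
+
+ client = bigquery.Client()
+ # Repoint the stable name at the latest versioned table so Looker
+ # explores keep working while _v1 is deprecated.
+ client.query(
+     """
+     CREATE OR REPLACE VIEW `project.dataset.model_metrics` AS
+     SELECT * FROM `project.dataset.model_metrics_v2`
+     """
+ ).result()
+ ```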
+
+ ---
+
+ ## 📊 Dashboard-Ready Metrics Catalog
+
+ ### Model Performance Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Model Count** | `COUNT(DISTINCT model_id)` | Total models trained |
+ | **Avg Accuracy** | `AVG(accuracy)` | Overall model quality |
+ | **Accuracy Trend** | `AVG(accuracy) OVER (ORDER BY created_date)` | Performance over time |
+ | **Best Model** | `model_id WHERE accuracy = MAX(accuracy)` | Top performer |
+ | **Models by Type** | `COUNT(*) GROUP BY model_type` | Algorithm distribution |
+ | **Training Time** | `AVG(training_duration_seconds)` | Resource usage |
+ | **Recent Models** | `WHERE created_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)` | Latest activity |
+
+ ### Feature Importance Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Top Features** | `WHERE importance_rank <= 10` | Most impactful features |
+ | **Avg Importance** | `AVG(importance_score)` | Feature impact distribution |
+ | **Engineered Features** | `COUNT(*) WHERE is_engineered = true` | Feature engineering effectiveness |
+ | **Feature Stability** | `STDDEV(importance_score) GROUP BY feature_name` | Consistent predictors |
+
+ ### Prediction Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Accuracy Rate** | `AVG(CAST(is_correct AS FLOAT64))` | Real-world performance |
+ | **MAE** | `AVG(absolute_error)` | Average error magnitude |
+ | **RMSE** | `SQRT(AVG(squared_error))` | Error with outlier penalty |
+ | **Predictions/Day** | `COUNT(*) GROUP BY predicted_date` | Volume tracking |
+ | **Confidence Distribution** | `APPROX_QUANTILES(prediction_confidence, 10)` | Model calibration |
+ | **Segment Performance** | `AVG(is_correct) GROUP BY segment` | Fairness check |
+
+ ### Data Quality Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Data Quality Score** | `AVG(quality_score)` | Overall health |
+ | **Null Rate** | `AVG(null_percentage)` | Completeness |
+ | **Columns with Issues** | `COUNT(DISTINCT column_name) WHERE validation_status != 'pass'` | Problem areas |
+ | **Quality Trend** | `AVG(quality_score) OVER (ORDER BY profiled_date)` | Improving/degrading? |
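+
+ These calculations drop straight into scheduled queries or ad-hoc checks. A minimal sketch (table path and window are placeholders) pulling the 30-day accuracy trend with the Python BigQuery client — note the `created_date` filter, which `require_partition_filter=true` makes mandatory:
+
+ ```python
+ from google.cloud import bigquery
+
+ client = bigquery.Client()
+ rows = client.query(
+     """
+     SELECT created_date, AVG(accuracy) AS avg_accuracy
+     FROM `project.dataset.model_metrics`
+     WHERE created_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
+     GROUP BY created_date
+     ORDER BY created_date
+     """
+ ).result()
+ for row in rows:
+     print(row.created_date, row.avg_accuracy)  # one point per day for the trend chart
+ ```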
+
+ ---
+
+ ## 🎯 Sample Looker Explores
+
+ ### Explore 1: Model Performance Analysis
+
+ ```lookml
+ explore: model_metrics {
+   label: "Model Performance"
+   description: "Track model accuracy, training time, and comparison"
+
+   join: feature_importance {
+     type: left_outer
+     sql_on: ${model_metrics.model_id} = ${feature_importance.model_id} ;;
+     relationship: one_to_many
+   }
+ }
+ ```
+
+ ### Explore 2: Prediction Monitoring
+
+ ```lookml
+ explore: predictions {
+   label: "Prediction Monitoring"
+   description: "Real-time prediction accuracy and drift"
+
+   join: model_metrics {
+     type: left_outer
+     sql_on: ${predictions.model_id} = ${model_metrics.model_id} ;;
+     relationship: many_to_one
+   }
+ }
+ ```
+
+ ### Explore 3: Data Quality Dashboard
+
+ ```lookml
+ explore: data_profile_summary {
+   label: "Data Quality"
+   description: "Monitor data health and schema drift"
+ }
+ ```
+
+ ---
+
+ ## 📝 Implementation Checklist
+
+ ### Phase 1: Setup (Week 1)
+ - [ ] Create all 4 BigQuery tables with partitioning
+ - [ ] Set up service account permissions
+ - [ ] Configure table expiration policies
+ - [ ] Document table owners and update SLAs
+
+ ### Phase 2: Integration (Week 2)
+ - [ ] Update tools to write to these schemas
+ - [ ] Add schema validation in CI/CD
+ - [ ] Create data dictionary in Looker
+ - [ ] Set up table monitoring alerts
+
+ ### Phase 3: BI Layer (Week 3)
+ - [ ] Create Looker views for all 4 tables
+ - [ ] Build explores with joins
+ - [ ] Create initial dashboards
+ - [ ] Set up scheduled data refreshes
+
+ ### Phase 4: Validation (Week 4)
+ - [ ] Backfill historical data
+ - [ ] Verify dashboard accuracy
+ - [ ] Train stakeholders on dashboards
+ - [ ] Document runbooks for common issues
+
+ ---
+
+ ## 🔗 Related Tools
+
+ **BigQuery Write Tools** (src/bigquery/):
+ - `bigquery_write_results()` - Generic write function
+ - Helper: `bigquery_write_model_metrics()` - Specialized writer
+ - Helper: `bigquery_write_feature_importance()` - Specialized writer
+ - Helper: `bigquery_write_predictions()` - Specialized writer
+ - Helper: `bigquery_write_data_profile()` - Specialized writer
+
+ **Example Usage**:
+ ```python
+ from src.bigquery import bigquery_write_results
+
+ # Write model metrics
+ bigquery_write_results(
+     data=metrics_df,
+     table_id="project.dataset.model_metrics",
+     write_disposition="WRITE_APPEND"
+ )
+ ```
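+
+ The example assumes `metrics_df` already matches the `model_metrics` grain (one row per training run). A hedged sketch of assembling such a frame with pandas — values are placeholders, and only a subset of the schema's columns is shown:
+
+ ```python
+ import pandas as pd
+
+ now = pd.Timestamp.now(tz="UTC")
+ # One row per training execution; column names match Table 1 above
+ metrics_df = pd.DataFrame([{
+     "model_id": "xgboost_churn_20251223_153045",
+     "model_type": "XGBoost",
+     "task_type": "classification",
+     "created_at": now,
+     "created_date": now.date(),
+     "accuracy": 0.95,
+     "f1_score": 0.90,
+ }])
+ ```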
+
+ ---
+
+ ## 📚 Additional Resources
+
+ - [BigQuery Best Practices](https://cloud.google.com/bigquery/docs/best-practices)
+ - [Looker LookML Reference](https://cloud.google.com/looker/docs/reference/lookml-quick-reference)
+ - [Schema Design for BI](https://cloud.google.com/architecture/bigquery-data-warehouse)
+
+ ---
+
+ **Last Updated**: December 23, 2025
+ **Schema Version**: 1.0.0
+ **Maintained By**: Data Science Team
+ **Review Cadence**: Quarterly
CHECKLIST.md ADDED
@@ -0,0 +1,97 @@
+ # ✅ Pre-Launch Checklist
+
+ ## Before Running the Application
+
+ ### 1. Environment Variables ⚠️ **REQUIRED**
+
+ You MUST set your API key before starting:
+
+ ```powershell
+ # Windows PowerShell
+ $env:GOOGLE_API_KEY="your-google-api-key-here"
+
+ # Verify it's set
+ echo $env:GOOGLE_API_KEY
+ ```
+
+ ### 2. Build Status ✅
+
+ - [x] Frontend dependencies installed
+ - [x] Frontend built (FRRONTEEEND/dist exists)
+ - [x] Backend code updated with new endpoints
+ - [x] Configuration files in place
+
+ ### 3. Quick Start Commands
+
+ **Option A - Use the start script:**
+ ```powershell
+ .\start.ps1
+ ```
+
+ **Option B - Manual start:**
+ ```powershell
+ # Make sure you're in the project root
+ Set-Location "c:\Users\Pulastya\Videos\DS AGENTTTT"
+
+ # Set API key (if not already set)
+ $env:GOOGLE_API_KEY="your-key-here"
+
+ # Start the server
+ python src\api\app.py
+ ```
+
+ ### 4. Access the Application
+
+ Once the server starts, open your browser to:
+ **http://localhost:8080**
+
+ You should see:
+ 1. **Landing Page** - Professional homepage with agent features
+ 2. **Launch Console** button - Click to open the chat interface
+ 3. **Chat Interface** - Modern conversational UI
+
+ ### 5. Test the Chat
+
+ Try these sample prompts:
+ - "What can you do?"
+ - "Explain your data science capabilities"
+ - "How do I upload a dataset?"
+ - "What ML models do you support?"
+
+ ### 6. Expected Console Output
+
+ When you start the server, you should see:
+ ```
+ INFO: Started server process [####]
+ INFO: Waiting for application startup.
+ ✅ Agent initialized with provider: groq
+ ✅ Frontend assets mounted from C:\Users\Pulastya\Videos\DS AGENTTTT\FRRONTEEEND\dist
+ INFO: Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8080
+ ```
+
+ ### 7. Troubleshooting Quick Reference
+
+ | Issue | Solution |
+ |-------|----------|
+ | "Agent not initialized" | Set GOOGLE_API_KEY environment variable |
+ | "Frontend not found" | Run `cd FRRONTEEEND && npm run build` |
+ | Port 8080 in use | Kill the process or change PORT env var |
+ | Import errors | Run `pip install -r requirements.txt` |
+
+ ## Next Steps After Launch
+
+ 1. **Test the chat** with the agent
+ 2. **Upload a dataset** (feature coming soon in chat)
+ 3. **Try the API endpoints** at http://localhost:8080/docs
+ 4. **Customize the frontend** in FRRONTEEEND/components/
+
+ ## Documentation
+
+ - 📖 [MIGRATION_COMPLETE.md](MIGRATION_COMPLETE.md) - What was changed
+ - 📖 [FRONTEND_INTEGRATION.md](FRONTEND_INTEGRATION.md) - Technical details
+ - 📖 [README.md](README.md) - Main project docs
+
+ ---
+
+ **Ready to launch?** Run `.\start.ps1` and visit http://localhost:8080 🚀
DEPLOYMENT.md ADDED
@@ -0,0 +1,495 @@
+ # 🚀 Google Cloud Run Deployment Guide
+
+ A complete guide to deploying the Data Science Agent to Google Cloud Run as a serverless API.
+
+ ## 📋 Prerequisites
+
+ 1. **Google Cloud Platform Account**
+    - Active GCP account with billing enabled
+    - Project created (or use existing project)
+
+ 2. **Install Google Cloud SDK**
+    ```bash
+    # macOS (Homebrew)
+    brew install --cask google-cloud-sdk
+
+    # Or download from: https://cloud.google.com/sdk/install
+    ```
+
+ 3. **Authenticate with GCP**
+    ```bash
+    gcloud auth login
+    gcloud auth application-default login
+    ```
+
+ 4. **Set Your Project**
+    ```bash
+    gcloud config set project YOUR_PROJECT_ID
+    ```
+
+ ---
+
+ ## 🎯 Deployment Options
+
+ ### Option 1: Automated Deployment (Recommended)
+
+ Use the provided deployment script for one-command deployment:
+
+ ```bash
+ # Set required environment variables
+ export GCP_PROJECT_ID="your-project-id"
+ export GROQ_API_KEY="your-groq-api-key"
+ export GOOGLE_API_KEY="your-google-api-key"  # Optional for Gemini
+
+ # Run deployment script
+ ./deploy.sh
+ ```
+
+ **What it does:**
+ - ✅ Enables required GCP APIs (Cloud Build, Cloud Run, Secret Manager)
+ - ✅ Creates secrets for API keys
+ - ✅ Builds Docker container
+ - ✅ Deploys to Cloud Run
+ - ✅ Returns service URL
+
+ **Configuration options:**
+ ```bash
+ # Optional: Customize deployment
+ export CLOUD_RUN_REGION="us-central1"  # Change region
+ export MEMORY="4Gi"                    # Increase memory
+ export CPU="2"                         # Set CPU count
+ export MAX_INSTANCES="10"              # Scale limit
+ export TIMEOUT="900"                   # Request timeout (15 min)
+
+ ./deploy.sh
+ ```
+
+ ---
+
+ ### Option 2: Manual Deployment
+
+ Step-by-step manual deployment for full control:
+
+ #### Step 1: Enable APIs
+ ```bash
+ gcloud services enable \
+   cloudbuild.googleapis.com \
+   run.googleapis.com \
+   containerregistry.googleapis.com \
+   secretmanager.googleapis.com
+ ```
+
+ #### Step 2: Create Secrets
+ ```bash
+ # Create GROQ API key secret
+ echo -n "your-groq-api-key" | gcloud secrets create GROQ_API_KEY --data-file=-
+
+ # Create Google API key secret (optional)
+ echo -n "your-google-api-key" | gcloud secrets create GOOGLE_API_KEY --data-file=-
+
+ # Grant Cloud Run access to secrets
+ PROJECT_NUMBER=$(gcloud projects describe $(gcloud config get-value project) --format="value(projectNumber)")
+ gcloud secrets add-iam-policy-binding GROQ_API_KEY \
+   --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
+   --role="roles/secretmanager.secretAccessor"
+ ```
+
+ #### Step 3: Build Container
+ ```bash
+ gcloud builds submit --tag gcr.io/$(gcloud config get-value project)/data-science-agent
+ ```
+
+ #### Step 4: Deploy to Cloud Run
+ ```bash
+ gcloud run deploy data-science-agent \
+   --image gcr.io/$(gcloud config get-value project)/data-science-agent \
+   --platform managed \
+   --region us-central1 \
+   --allow-unauthenticated \
+   --memory 4Gi \
+   --cpu 2 \
+   --timeout 900 \
+   --max-instances 10 \
+   --set-env-vars LLM_PROVIDER=groq,REASONING_EFFORT=medium \
+   --set-secrets GROQ_API_KEY=GROQ_API_KEY:latest,GOOGLE_API_KEY=GOOGLE_API_KEY:latest
+ ```
+
+ ---
+
+ ### Option 3: CI/CD with Cloud Build Triggers
+
+ Automated deployment on git push:
+
+ #### Step 1: Connect Repository
+ ```bash
+ # Connect GitHub/GitLab/Bitbucket repository
+ gcloud beta builds connections create github connection-name \
+   --region=us-central1
+ ```
+
+ #### Step 2: Create Build Trigger
+ ```bash
+ gcloud builds triggers create github \
+   --name="deploy-data-science-agent" \
+   --repo-name="Data-Science-Agent" \
+   --repo-owner="Surfing-Ninja" \
+   --branch-pattern="^main$" \
+   --build-config="cloudbuild.yaml"
+ ```
+
+ Now every push to the `main` branch automatically deploys! 🎉
+
+ ---
+
+ ## 🧪 Testing the Deployment
+
+ ### 1. Health Check
+ ```bash
+ SERVICE_URL=$(gcloud run services describe data-science-agent \
+   --region us-central1 \
+   --format 'value(status.url)')
+
+ curl $SERVICE_URL/health
+ ```
+
+ **Expected response:**
+ ```json
+ {
+   "status": "healthy",
+   "agent_ready": true,
+   "provider": "groq",
+   "tools_count": 82
+ }
+ ```
+
+ ### 2. List Available Tools
+ ```bash
+ curl $SERVICE_URL/tools | jq
+ ```
+
+ ### 3. Profile a Dataset
+ ```bash
+ curl -X POST $SERVICE_URL/profile \
+   -F "file=@test_data/sample.csv"
+ ```
+
+ ### 4. Run Full Analysis
+ ```bash
+ curl -X POST $SERVICE_URL/run \
+   -F "file=@test_data/sample.csv" \
+   -F "task_description=Analyze this dataset, detect outliers, and train a prediction model" \
+   -F "target_col=target" \
+   | jq
+ ```
+
+ ---
+
+ ## 📊 Monitoring & Logs
+
+ ### View Real-time Logs
+ ```bash
+ gcloud run logs tail data-science-agent --region us-central1
+ ```
+
+ ### View Recent Logs
+ ```bash
+ gcloud run logs read data-science-agent \
+   --region us-central1 \
+   --limit 50
+ ```
+
+ ### Cloud Console Monitoring
+ - Go to: https://console.cloud.google.com/run
+ - Click on `data-science-agent`
+ - View: Metrics, Logs, Revisions
+
+ ---
+
+ ## 💰 Cost Estimation
+
+ ### Cloud Run Pricing (as of Dec 2024)
+ **Free Tier** (per month):
+ - 2 million requests
+ - 360,000 GB-seconds of memory
+ - 180,000 vCPU-seconds
+
+ **Paid Tier** (us-central1):
+ - CPU: $0.00002400 per vCPU-second
+ - Memory: $0.00000250 per GB-second
+ - Requests: $0.40 per million requests
+
+ **Example Cost for 4Gi Memory, 2 vCPU:**
+ - 1 request taking 60 seconds
+ - CPU: 2 vCPU × 60s × $0.000024 = $0.00288
+ - Memory: 4GB × 60s × $0.0000025 = $0.0006
+ - Request: $0.0000004
+ - **Total: ~$0.0035 per request**
+
+ **Monthly estimate for 1000 requests/month:**
+ - ~$3.50/month (well within free tier for testing!)
+
+ ---
+
+ ## 🔒 Security Best Practices
+
+ ### 1. Enable Authentication (Production)
+ ```bash
+ # Deploy with authentication required
+ gcloud run deploy data-science-agent \
+   --no-allow-unauthenticated \
+   --region us-central1 \
+   --image gcr.io/PROJECT_ID/data-science-agent
+
+ # Create service account for clients
+ gcloud iam service-accounts create api-client
+
+ # Grant invoker role
+ gcloud run services add-iam-policy-binding data-science-agent \
+   --member="serviceAccount:api-client@PROJECT_ID.iam.gserviceaccount.com" \
+   --role="roles/run.invoker" \
+   --region us-central1
+ ```
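+
+ Once `--no-allow-unauthenticated` is set, callers must present an identity token. A minimal client sketch using the standard Google auth libraries (the service URL is a placeholder; run this with credentials that hold `roles/run.invoker`):
+
+ ```python
+ import requests
+ import google.auth.transport.requests
+ import google.oauth2.id_token
+
+ SERVICE_URL = "https://YOUR_SERVICE_URL"
+
+ # Mint an ID token whose audience is the Cloud Run service URL
+ auth_req = google.auth.transport.requests.Request()
+ token = google.oauth2.id_token.fetch_id_token(auth_req, SERVICE_URL)
+
+ resp = requests.get(
+     f"{SERVICE_URL}/health",
+     headers={"Authorization": f"Bearer {token}"},
+     timeout=30,
+ )
+ print(resp.json())
+ ```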
+
+ ### 2. Use VPC Connector (For BigQuery/GCS)
+ ```bash
+ # Create VPC connector
+ gcloud compute networks vpc-access connectors create ds-agent-connector \
+   --network default \
+   --region us-central1 \
+   --range 10.8.0.0/28
+
+ # Deploy with VPC
+ gcloud run deploy data-science-agent \
+   --vpc-connector ds-agent-connector \
+   --region us-central1
+ ```
+
+ ### 3. Restrict API Keys
+ - Set **Application restrictions** in Google Cloud Console
+ - Whitelist only the Cloud Run service URL
+ - Set **API restrictions** to only required APIs
+
+ ---
+
+ ## 🔧 Configuration Options
+
+ ### Environment Variables
+ ```bash
+ # Set during deployment
+ --set-env-vars KEY1=value1,KEY2=value2
+
+ # Available variables:
+ LLM_PROVIDER=groq            # or "gemini"
+ REASONING_EFFORT=medium      # low, medium, high
+ CACHE_TTL_SECONDS=86400      # Cache lifetime
+ ARTIFACT_BACKEND=local       # or "gcs" for cloud storage
+ GCS_BUCKET_NAME=your-bucket  # If using GCS backend
+ OUTPUT_DIR=/tmp/outputs      # Output directory
+ MAX_PARALLEL_TOOLS=5         # Concurrent tool execution
+ MAX_RETRIES=3                # Tool retry attempts
+ TIMEOUT_SECONDS=300          # Tool timeout
+ ```
+
+ ### Resource Limits
+ ```bash
+ --memory 4Gi        # 128Mi to 32Gi
+ --cpu 2             # 1 to 8 vCPU
+ --timeout 900       # Max 3600s (1 hour)
+ --max-instances 10  # Scale limit
+ --min-instances 0   # Always-warm instances
+ --concurrency 10    # Requests per instance
+ ```
+
+ ---
+
+ ## 🐛 Troubleshooting
+
+ ### Build Fails
+ ```bash
+ # Check build logs
+ gcloud builds list --limit=5
+ gcloud builds log BUILD_ID
+
+ # Common fixes:
+ # - Ensure Dockerfile is in root directory
+ # - Check requirements.txt has all dependencies
+ # - Increase build timeout: --timeout=1200s
+ ```
+
+ ### Deployment Fails
+ ```bash
+ # Check service status
+ gcloud run services describe data-science-agent --region us-central1
+
+ # Common fixes:
+ # - Ensure APIs are enabled
+ # - Check secrets exist and are accessible
+ # - Verify service account permissions
+ ```
+
+ ### Runtime Errors
+ ```bash
+ # View logs
+ gcloud run logs tail data-science-agent --region us-central1
+
+ # Common issues:
+ # - API keys not set: Check secrets
+ # - Import errors: Ensure all dependencies in requirements.txt
+ # - Memory issues: Increase --memory limit
+ # - Timeout: Increase --timeout value
+ ```
+
+ ### Container Crashes
+ ```bash
+ # Test locally first
+ docker build -t ds-agent .
+ docker run -p 8080:8080 \
+   -e GROQ_API_KEY="your-key" \
+   ds-agent
+
+ curl http://localhost:8080/health
+ ```
+
+ ---
+
+ ## 🚀 Advanced Features
+
+ ### Custom Domain
+ ```bash
+ # Map custom domain
+ gcloud run domain-mappings create \
+   --service data-science-agent \
+   --domain api.yourdomain.com \
+   --region us-central1
+ ```
+
+ ### Load Balancing
+ ```bash
+ # Create multiple regional deployments
+ for region in us-central1 us-east1 europe-west1; do
+   gcloud run deploy data-science-agent \
+     --image gcr.io/PROJECT_ID/data-science-agent \
+     --region $region
+ done
+
+ # Set up global load balancer
+ # Follow: https://cloud.google.com/load-balancing/docs/https/setup-global-ext-https-serverless
+ ```
+
+ ### Multi-Region Deployment
+ ```bash
+ # Deploy to multiple regions for high availability
+ CLOUD_RUN_REGION=us-central1 ./deploy.sh
+ CLOUD_RUN_REGION=europe-west1 ./deploy.sh
+ CLOUD_RUN_REGION=asia-east1 ./deploy.sh
+ ```
+
+ ---
+
+ ## 📝 API Documentation
+
+ Once deployed, access Swagger docs at:
+ ```
+ https://YOUR_SERVICE_URL/docs
+ ```
+
+ ### Available Endpoints
+
+ #### `GET /` - Health Check
+ Returns service status and tool count.
+
+ #### `GET /health` - Detailed Health
+ Returns agent readiness and provider info.
+
+ #### `GET /tools` - List Tools
+ Returns all 82 available tools organized by category.
+
+ #### `POST /run` - Run Full Analysis
+ Upload a dataset and execute the complete data science workflow.
+
+ **Parameters:**
+ - `file`: CSV/Parquet file (multipart/form-data)
+ - `task_description`: Natural language task description
+ - `target_col`: Target column for ML (optional)
+ - `use_cache`: Enable caching (default: true)
+ - `max_iterations`: Max workflow steps (default: 20)
+
+ #### `POST /profile` - Quick Profile
+ Quick dataset profiling without the full workflow.
+
+ **Parameters:**
+ - `file`: CSV/Parquet file (multipart/form-data)
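+
+ The same endpoints can be driven from Python. A minimal sketch mirroring the curl examples above (URL and file path are placeholders):
+
+ ```python
+ import requests
+
+ SERVICE_URL = "https://YOUR_SERVICE_URL"
+
+ # Multipart upload plus form fields, matching the /run parameters above
+ with open("test_data/sample.csv", "rb") as f:
+     resp = requests.post(
+         f"{SERVICE_URL}/run",
+         files={"file": f},
+         data={
+             "task_description": "Analyze this dataset and train a prediction model",
+             "target_col": "target",
+         },
+         timeout=900,  # matches the service's request timeout
+     )
+ resp.raise_for_status()
+ print(resp.json())
+ ```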
+
+ ---
+
+ ## 🔄 Updates & Rollbacks
+
+ ### Update Deployment
+ ```bash
+ # Rebuild and redeploy
+ ./deploy.sh
+ ```
+
+ ### Rollback to Previous Revision
+ ```bash
+ # List revisions
+ gcloud run revisions list --service data-science-agent --region us-central1
+
+ # Rollback
+ gcloud run services update-traffic data-science-agent \
+   --to-revisions REVISION_NAME=100 \
+   --region us-central1
+ ```
+
+ ### Blue/Green Deployment
+ ```bash
+ # Deploy new version with tag
+ gcloud run deploy data-science-agent \
+   --tag blue \
+   --no-traffic \
+   --region us-central1
+
+ # Test: https://blue---data-science-agent-HASH.run.app
+
+ # Switch traffic
+ gcloud run services update-traffic data-science-agent \
+   --to-tags blue=100 \
+   --region us-central1
+ ```
+
+ ---
+
+ ## 📚 Additional Resources
+
+ - **Cloud Run Docs**: https://cloud.google.com/run/docs
+ - **Pricing Calculator**: https://cloud.google.com/products/calculator
+ - **Best Practices**: https://cloud.google.com/run/docs/tips
+ - **Quotas & Limits**: https://cloud.google.com/run/quotas
+
+ ---
+
+ ## ✅ Deployment Checklist
+
+ - [ ] GCP project created and billing enabled
+ - [ ] Google Cloud SDK installed and authenticated
+ - [ ] API keys obtained (GROQ_API_KEY, GOOGLE_API_KEY)
+ - [ ] Secrets created in Secret Manager
+ - [ ] Docker container builds successfully locally
+ - [ ] Cloud Run APIs enabled
+ - [ ] Service deployed to Cloud Run
+ - [ ] Health check endpoint returns 200
+ - [ ] Test dataset profiled successfully
+ - [ ] Full analysis workflow tested
+ - [ ] Monitoring/logging configured
+ - [ ] Cost alerts set up (optional)
+ - [ ] Custom domain mapped (optional)
+ - [ ] CI/CD pipeline configured (optional)
+
+ ---
+
+ **Need help?** Check the troubleshooting section or view logs with:
+ ```bash
+ gcloud run logs tail data-science-agent --region us-central1
+ ```
+
+ Happy deploying! 🎉
Dockerfile ADDED
@@ -0,0 +1,78 @@
+ # Multi-stage build for Google Cloud Run
+ # Stage 1: Build Frontend
+ FROM node:20-alpine as frontend-builder
+
+ WORKDIR /frontend
+
+ # Copy frontend files
+ COPY FRRONTEEEND/package*.json ./
+ RUN npm install
+
+ COPY FRRONTEEEND/ ./
+ RUN npm run build
+
+ # Stage 2: Build Python environment
+ FROM python:3.13-slim as builder
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     g++ \
+     make \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create virtual environment
+ RUN python -m venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"
+
+ # Copy requirements and install Python packages
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Stage 3: Runtime environment
+ FROM python:3.13-slim
+
+ # Install runtime dependencies only
+ RUN apt-get update && apt-get install -y \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy virtual environment from builder
+ COPY --from=builder /opt/venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy application code
+ COPY src/ /app/src/
+ COPY examples/ /app/examples/
+
+ # Copy built frontend from frontend-builder
+ COPY --from=frontend-builder /frontend/dist /app/FRRONTEEEND/dist
+
+ # Create necessary directories for Cloud Run ephemeral storage
+ RUN mkdir -p /tmp/data_science_agent \
+     /tmp/outputs/models \
+     /tmp/outputs/plots \
+     /tmp/outputs/reports \
+     /tmp/outputs/data \
+     /tmp/cache_db
+
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1
+ ENV PORT=8080
+ ENV OUTPUT_DIR=/tmp/outputs
+ ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
+ ENV ARTIFACT_BACKEND=local
+
+ # Cloud Run expects the service to listen on the PORT env variable
+ EXPOSE 8080
+
+ # Health check (optional, Cloud Run handles this)
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+     CMD python -c "import requests; requests.get('http://localhost:8080/health')" || exit 1
+
+ # Run the FastAPI application
+ CMD ["python", "src/api/app.py"]
FRONTEND_INTEGRATION.md ADDED
@@ -0,0 +1,234 @@
+ # Data Science Agent - Frontend Integration Guide
+
+ ## 🎉 New React Frontend
+
+ The application now features a modern, professional React frontend that replaces the old Gradio interface.
+
+ ### Features
+
+ - **Beautiful Landing Page**: Showcases the agent's capabilities with modern design
+ - **Professional Chat Interface**: NextChat-style conversational UI
+ - **Direct Backend Integration**: Communicates with your FastAPI backend
+ - **Responsive Design**: Works on all devices
+ - **Dark Theme**: Modern, eye-friendly interface
+
+ ## 🚀 Quick Start
+
+ ### Prerequisites
+
+ - Python 3.13+
+ - Node.js 20+
+ - npm (comes with Node.js)
+
+ ### Running the Application
+
+ #### Option 1: Using the Build Script (Recommended)
+
+ **Windows:**
+ ```powershell
+ .\build-and-deploy.ps1
+ ```
+
+ **Linux/Mac:**
+ ```bash
+ chmod +x build-and-deploy.sh
+ ./build-and-deploy.sh
+ ```
+
+ Then start the server:
+ ```bash
+ python src/api/app.py
+ ```
+
+ #### Option 2: Manual Steps
+
+ 1. **Build the Frontend:**
+    ```bash
+    cd FRRONTEEEND
+    npm.cmd install
+    npm.cmd run build
+    cd ..
+    ```
+
+ 2. **Install Python Dependencies:**
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. **Start the Backend Server:**
+    ```bash
+    python src/api/app.py
+    ```
+
+ 4. **Access the Application:**
+    Open your browser and navigate to: http://localhost:8080
+
+ ## 🏗️ Architecture
+
+ ### Backend (FastAPI)
+ - **Location**: `src/api/app.py`
+ - **Port**: 8080
+ - **Endpoints**:
+   - `GET /` - Health check & landing page
+   - `POST /chat` - Chat interface endpoint
+   - `POST /run` - Full data science workflow
+   - `POST /profile` - Dataset profiling
+   - `GET /tools` - List available tools
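+
+ A quick way to smoke-test these endpoints from Python once the server is up (the file path is a placeholder):
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8080"
+
+ print(requests.get(f"{BASE}/tools", timeout=30).json())  # list available tools
+
+ # Profile a local dataset via multipart upload
+ with open("data/sample.csv", "rb") as f:
+     print(requests.post(f"{BASE}/profile", files={"file": f}, timeout=300).json())
+ ```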
77
+
78
+ ### Frontend (React + Vite)
79
+ - **Location**: `FRRONTEEEND/`
80
+ - **Build Output**: `FRRONTEEEND/dist/`
81
+ - **Dev Port**: 3000 (development mode)
82
+ - **Production**: Served by FastAPI at port 8080
83
+
84
+ ## 🔧 Development Mode
85
+
86
+ If you want to develop the frontend with hot-reloading:
87
+
88
+ 1. **Terminal 1 - Backend:**
89
+ ```bash
90
+ python src/api/app.py
91
+ ```
92
+
93
+ 2. **Terminal 2 - Frontend:**
94
+ ```bash
95
+ cd FRRONTEEEND
96
+ npm run dev
97
+ ```
98
+
99
+ Access:
100
+ - Frontend (dev): http://localhost:3000
101
+ - Backend API: http://localhost:8080
102
+
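+ If the two servers run separately, requests from the dev frontend must reach the backend on port 8080. One way to wire this up is a Vite dev proxy; the snippet below is a sketch (the repo's `FRRONTEEEND/vite.config.ts` may already handle this differently, e.g. by using `VITE_API_URL` directly):
+
+ ```ts
+ // FRRONTEEEND/vite.config.ts — hypothetical dev-proxy variant
+ import { defineConfig } from 'vite';
+
+ export default defineConfig({
+   server: {
+     port: 3000,
+     proxy: {
+       // Forward backend routes to FastAPI during development
+       '/chat': 'http://localhost:8080',
+       '/run': 'http://localhost:8080',
+       '/profile': 'http://localhost:8080',
+     },
+   },
+ });
+ ```
+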
103
+ ## 🌐 API Integration
104
+
105
+ The frontend now communicates with your FastAPI backend instead of calling external APIs directly.
106
+
107
+ ### Environment Variables
108
+
109
+ Create `FRRONTEEEND/.env` for local development:
110
+ ```env
111
+ VITE_API_URL=http://localhost:8080
112
+ ```
113
+
114
+ For production, update `FRRONTEEEND/.env.production`:
115
+ ```env
116
+ VITE_API_URL=https://your-cloud-run-url.run.app
117
+ ```
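+
+ The frontend can then resolve the backend base URL from Vite's `import.meta.env`, falling back to the page's own origin when FastAPI serves the built frontend (the same fallback `ChatInterface.tsx` uses):
+
+ ```ts
+ // Prefer the build-time VITE_API_URL; otherwise assume a same-origin deployment
+ const API_URL: string = import.meta.env.VITE_API_URL || window.location.origin;
+ ```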
118
+
119
+ ## 📦 Deployment
120
+
121
+ ### Docker Build
122
+
123
+ The Dockerfile now includes a multi-stage build that:
124
+ 1. Builds the React frontend
125
+ 2. Builds the Python environment
126
+ 3. Combines both in the final image
127
+
128
+ ```bash
129
+ docker build -t data-science-agent .
130
+ docker run -p 8080:8080 data-science-agent
131
+ ```
132
+
133
+ ### Google Cloud Run
134
+
135
+ ```bash
136
+ gcloud builds submit --tag gcr.io/YOUR-PROJECT-ID/data-science-agent
137
+ gcloud run deploy data-science-agent \
138
+ --image gcr.io/YOUR-PROJECT-ID/data-science-agent \
139
+ --platform managed \
140
+ --region us-central1 \
141
+ --allow-unauthenticated \
142
+ --set-env-vars GROQ_API_KEY=your-api-key
143
+ ```
144
+
145
+ ## 🔄 What Changed
146
+
147
+ ### Removed
148
+ - ❌ Gradio interface (`chat_ui.py` remains in the repo for reference but is no longer served)
149
+ - ❌ Direct Google GenAI calls from frontend
150
+ - ❌ Gradio dependency
151
+
152
+ ### Added
153
+ - ✅ React + TypeScript frontend with Vite
154
+ - ✅ Professional landing page
155
+ - ✅ Modern chat interface
156
+ - ✅ `/chat` API endpoint
157
+ - ✅ CORS support in FastAPI
158
+ - ✅ Static file serving for React app
159
+ - ✅ Multi-stage Docker build
160
+
161
+ ## 🛠️ Tech Stack
162
+
163
+ ### Frontend
164
+ - React 19
165
+ - TypeScript 5.8
166
+ - Vite 6
167
+ - Tailwind CSS
168
+ - Framer Motion (animations)
169
+ - Lucide React (icons)
170
+
171
+ ### Backend (unchanged)
172
+ - FastAPI
173
+ - Python 3.13
174
+ - Groq API
175
+ - Polars, DuckDB
176
+ - Scikit-learn, XGBoost, LightGBM
177
+
178
+ ## 📁 Project Structure
179
+
180
+ ```
181
+ .
182
+ ├── FRRONTEEEND/ # React frontend
183
+ │ ├── components/ # React components
184
+ │ ├── dist/ # Built frontend (after npm run build)
185
+ │ ├── package.json
186
+ │ ├── vite.config.ts
187
+ │ └── .env # Frontend environment variables
188
+ ├── src/
189
+ │ ├── api/
190
+ │ │ └── app.py # FastAPI backend (updated)
191
+ │ ├── tools/ # Data science tools
192
+ │ └── orchestrator.py # Main agent logic
193
+ ├── requirements.txt # Python dependencies (updated)
194
+ ├── Dockerfile # Multi-stage build (updated)
195
+ ├── build-and-deploy.ps1 # Windows build script
196
+ └── build-and-deploy.sh # Linux/Mac build script
197
+ ```
198
+
199
+ ## 🐛 Troubleshooting
200
+
201
+ ### Frontend doesn't load
202
+ - Make sure you've run `npm run build` in the FRRONTEEEND directory
203
+ - Check that `FRRONTEEEND/dist/` exists and contains files
204
+
205
+ ### API errors in chat
206
+ - Ensure the backend is running on port 8080
207
+ - Check that `GROQ_API_KEY` is set in your environment
208
+ - Verify the API URL in `.env` file
209
+
210
+ ### CORS errors
211
+ - The backend now has CORS enabled for development
212
+ - For production, update the `allow_origins` in `src/api/app.py`
213
+
214
+ ## 📝 Notes
215
+
216
+ - The old `chat_ui.py` has been kept for reference but is no longer used
217
+ - All chat functionality now goes through the `/chat` endpoint
218
+ - The frontend is automatically served by FastAPI in production mode
219
+ - Session history is maintained in the frontend (browser)
220
+
221
+ ## 🎯 Next Steps
222
+
223
+ 1. **Customize the frontend**: Edit files in `FRRONTEEEND/components/`
224
+ 2. **Extend file upload**: `ChatInterface.tsx` already accepts CSV/Parquet uploads; add support for more formats
225
+ 3. **Add visualization**: Display charts from the backend in the chat
226
+ 4. **Authentication**: Add user authentication if needed
227
+
228
+ ## 📞 Support
229
+
230
+ For issues or questions:
231
+ 1. Check the console logs (browser & terminal)
232
+ 2. Verify environment variables
233
+ 3. Ensure all dependencies are installed
234
+ 4. Review the API documentation at http://localhost:8080/docs
FRRONTEEEND/.env.production ADDED
@@ -0,0 +1,3 @@
1
+ # Production API Configuration
2
+ # Update this to your production API URL
3
+ VITE_API_URL=https://your-cloud-run-url.run.app
FRRONTEEEND/.gitignore ADDED
@@ -0,0 +1,24 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
FRRONTEEEND/App.tsx ADDED
@@ -0,0 +1,59 @@
1
+
2
+ import React, { useState } from 'react';
3
+ import { HeroGeometric } from './components/HeroGeometric';
4
+ import ProblemSolution from './components/ProblemSolution';
5
+ import KeyCapabilities from './components/KeyCapabilities';
6
+ import Process from './components/Process';
7
+ import TechStack from './components/TechStack';
8
+ import Footer from './components/Footer';
9
+ import { BackgroundPaths } from './components/BackgroundPaths';
10
+ import { Logo } from './components/Logo';
11
+ import { ChatInterface } from './components/ChatInterface';
12
+
13
+ const App: React.FC = () => {
14
+ const [view, setView] = useState<'landing' | 'chat'>('landing');
15
+
16
+ if (view === 'chat') {
17
+ return <ChatInterface onBack={() => setView('landing')} />;
18
+ }
19
+
20
+ return (
21
+ <div className="min-h-screen bg-[#030303] text-white selection:bg-indigo-500/30">
22
+ {/* Navigation (Overlay) */}
23
+ <nav className="fixed top-0 left-0 right-0 z-50 flex justify-between items-center px-6 py-4 backdrop-blur-md bg-[#030303]/20 border-b border-white/5">
24
+ <div className="flex items-center gap-3 cursor-pointer" onClick={() => setView('landing')}>
25
+ <Logo className="w-10 h-10" />
26
+ <span className="font-bold tracking-tight text-lg hidden sm:block uppercase text-white">
27
+ DATA SCIENCE AGENT
28
+ </span>
29
+ </div>
30
+
31
+ <button
32
+ onClick={() => setView('chat')}
33
+ className="px-5 py-2 bg-white/5 hover:bg-white/10 border border-white/10 rounded-lg text-sm font-medium transition-all"
34
+ >
35
+ Launch Console
36
+ </button>
37
+ </nav>
38
+
39
+ <main>
40
+ <HeroGeometric onChatClick={() => setView('chat')} />
41
+ <TechStack />
42
+ <ProblemSolution />
43
+ <KeyCapabilities />
44
+
45
+ {/* Transitional background paths section */}
46
+ <BackgroundPaths
47
+ title="Intelligence Without Limits"
48
+ subtitle="The agent continuously learns from your specific domain, optimizing its own tools and reasoning strategies to solve your hardest data challenges."
49
+ />
50
+
51
+ <Process />
52
+ </main>
53
+
54
+ <Footer />
55
+ </div>
56
+ );
57
+ };
58
+
59
+ export default App;
FRRONTEEEND/README.md ADDED
@@ -0,0 +1,20 @@
1
+ <div align="center">
2
+ <img width="1200" height="475" alt="GHBanner" src="https://github.com/user-attachments/assets/0aa67016-6eaf-458a-adb2-6e31a0763ed6" />
3
+ </div>
4
+
5
+ # Run and deploy your AI Studio app
6
+
7
+ This contains everything you need to run your app locally.
8
+
9
+ View your app in AI Studio: https://ai.studio/apps/drive/1gChoktTuh429q26FzxS4BPo0q0LnlRE9
10
+
11
+ ## Run Locally
12
+
13
+ **Prerequisites:** Node.js
14
+
15
+
16
+ 1. Install dependencies:
17
+ `npm install`
18
+ 2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key
19
+ 3. Run the app:
20
+ `npm run dev`
FRRONTEEEND/components/BackgroundPaths.tsx ADDED
@@ -0,0 +1,148 @@
1
+
2
+ import React from "react";
3
+ import { motion } from "framer-motion";
4
+ import { ArrowRight } from "lucide-react";
5
+ import { cn } from "../lib/utils";
6
+
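+ // Renders 36 animated SVG bezier paths; position (+1 / -1) mirrors the fan direction.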
7
+ function FloatingPaths({ position }: { position: number }) {
8
+ const paths = Array.from({ length: 36 }, (_, i) => ({
9
+ id: i,
10
+ d: `M-${380 - i * 5 * position} -${189 + i * 6}C-${
11
+ 380 - i * 5 * position
12
+ } -${189 + i * 6} -${312 - i * 5 * position} ${216 - i * 6} ${
13
+ 152 - i * 5 * position
14
+ } ${343 - i * 6}C${616 - i * 5 * position} ${470 - i * 6} ${
15
+ 684 - i * 5 * position
16
+ } ${875 - i * 6} ${684 - i * 5 * position} ${875 - i * 6}`,
17
+ color: `rgba(99,102,241,${0.05 + i * 0.01})`, // Using indigo-500 tint
18
+ width: 0.5 + i * 0.03,
19
+ }));
20
+
21
+ return (
22
+ <div className="absolute inset-0 pointer-events-none">
23
+ <svg
24
+ className="w-full h-full text-indigo-500/20"
25
+ viewBox="0 0 696 316"
26
+ fill="none"
27
+ >
28
+ <title>Background Paths</title>
29
+ {paths.map((path) => (
30
+ <motion.path
31
+ key={path.id}
32
+ d={path.d}
33
+ stroke="currentColor"
34
+ strokeWidth={path.width}
35
+ strokeOpacity={0.1 + path.id * 0.02}
36
+ initial={{ pathLength: 0.3, opacity: 0.4 }}
37
+ animate={{
38
+ pathLength: 1,
39
+ opacity: [0.2, 0.5, 0.2],
40
+ pathOffset: [0, 1, 0],
41
+ }}
42
+ transition={{
43
+ duration: 15 + Math.random() * 10,
44
+ repeat: Number.POSITIVE_INFINITY,
45
+ ease: "linear",
46
+ }}
47
+ />
48
+ ))}
49
+ </svg>
50
+ </div>
51
+ );
52
+ }
53
+
54
+ export function BackgroundPaths({
55
+ title = "The Future is Autonomous",
56
+ subtitle = "Scale your data engineering and predictive modeling beyond human limits.",
57
+ }: {
58
+ title?: string;
59
+ subtitle?: string;
60
+ }) {
61
+ const words = title.split(" ");
62
+
63
+ return (
64
+ <section className="relative min-h-[80vh] w-full flex items-center justify-center overflow-hidden bg-[#030303]">
65
+ <div className="absolute inset-0">
66
+ <FloatingPaths position={1} />
67
+ <FloatingPaths position={-1} />
68
+ </div>
69
+
70
+ <div className="relative z-10 container mx-auto px-4 md:px-6 text-center">
71
+ <motion.div
72
+ initial={{ opacity: 0 }}
73
+ animate={{ opacity: 1 }}
74
+ transition={{ duration: 2 }}
75
+ className="max-w-4xl mx-auto"
76
+ >
77
+ <h2 className="text-5xl sm:text-6xl md:text-8xl font-extrabold mb-8 tracking-tighter">
78
+ {words.map((word, wordIndex) => (
79
+ <span
80
+ key={wordIndex}
81
+ className="inline-block mr-4 last:mr-0"
82
+ >
83
+ {word.split("").map((letter, letterIndex) => (
84
+ <motion.span
85
+ key={`${wordIndex}-${letterIndex}`}
86
+ initial={{ y: 50, opacity: 0 }}
87
+ whileInView={{ y: 0, opacity: 1 }}
88
+ viewport={{ once: true }}
89
+ transition={{
90
+ delay:
91
+ wordIndex * 0.1 +
92
+ letterIndex * 0.02,
93
+ type: "spring",
94
+ stiffness: 150,
95
+ damping: 25,
96
+ }}
97
+ className="inline-block text-transparent bg-clip-text
98
+ bg-gradient-to-r from-white via-white/90 to-white/70"
99
+ >
100
+ {letter}
101
+ </motion.span>
102
+ ))}
103
+ </span>
104
+ ))}
105
+ </h2>
106
+
107
+ <motion.p
108
+ initial={{ opacity: 0, y: 20 }}
109
+ whileInView={{ opacity: 1, y: 0 }}
110
+ viewport={{ once: true }}
111
+ transition={{ delay: 0.5 }}
112
+ className="text-white/40 text-xl font-medium mb-12 max-w-2xl mx-auto tracking-tight"
113
+ >
114
+ {subtitle}
115
+ </motion.p>
116
+
117
+ <motion.div
118
+ initial={{ opacity: 0, scale: 0.9 }}
119
+ whileInView={{ opacity: 1, scale: 1 }}
120
+ viewport={{ once: true }}
121
+ transition={{ delay: 0.8 }}
122
+ className="inline-block group relative bg-gradient-to-b from-white/10 to-indigo-500/10
123
+ p-px rounded-2xl backdrop-blur-lg
124
+ overflow-hidden shadow-lg hover:shadow-indigo-500/20 transition-all duration-300"
125
+ >
126
+ <button
127
+ className="rounded-[1.15rem] px-10 py-5 text-lg font-bold backdrop-blur-md
128
+ bg-white/95 hover:bg-white text-black transition-all duration-300
129
+ group-hover:-translate-y-0.5 border border-white/10
130
+ flex items-center gap-3"
131
+ >
132
+ <span className="opacity-90 group-hover:opacity-100 transition-opacity">
133
+ Deploy Your First Agent
134
+ </span>
135
+ <ArrowRight
136
+ className="w-5 h-5 opacity-70 group-hover:opacity-100 group-hover:translate-x-1.5
137
+ transition-all duration-300"
138
+ />
139
+ </button>
140
+ </motion.div>
141
+ </motion.div>
142
+ </div>
143
+
144
+ {/* Subtle glow effect at the bottom */}
145
+ <div className="absolute bottom-0 left-1/2 -translate-x-1/2 w-full h-px bg-gradient-to-r from-transparent via-indigo-500/50 to-transparent shadow-[0_0_50px_2px_rgba(99,102,241,0.2)]" />
146
+ </section>
147
+ );
148
+ }
FRRONTEEEND/components/ChatInterface.tsx ADDED
@@ -0,0 +1,571 @@
1
+
2
+ import React, { useState, useRef, useEffect } from 'react';
3
+ import { motion, AnimatePresence } from 'framer-motion';
4
+ import { Send, Plus, Search, Settings, MoreHorizontal, User, Bot, ArrowLeft, Paperclip, Sparkles, Trash2, X, Upload } from 'lucide-react';
5
+ import { cn } from '../lib/utils';
6
+ import { Logo } from './Logo';
7
+ import ReactMarkdown from 'react-markdown';
8
+
9
+ interface Message {
10
+ id: string;
11
+ role: 'user' | 'assistant';
12
+ content: string;
13
+ timestamp: Date;
14
+ file?: {
15
+ name: string;
16
+ size: number;
17
+ };
18
+ reports?: Array<{
19
+ name: string;
20
+ path: string;
21
+ }>;
22
+ }
23
+
24
+ interface ChatSession {
25
+ id: string;
26
+ title: string;
27
+ messages: Message[];
28
+ updatedAt: Date;
29
+ }
30
+
31
+ export const ChatInterface: React.FC<{ onBack: () => void }> = ({ onBack }) => {
32
+ const [sessions, setSessions] = useState<ChatSession[]>([
33
+ {
34
+ id: '1',
35
+ title: 'ML Model Analysis',
36
+ messages: [],
37
+ updatedAt: new Date(),
38
+ }
39
+ ]);
40
+ const [activeSessionId, setActiveSessionId] = useState('1');
41
+ const [input, setInput] = useState('');
42
+ const [isTyping, setIsTyping] = useState(false);
43
+ const [uploadedFile, setUploadedFile] = useState<File | null>(null);
44
+ const [reportModalUrl, setReportModalUrl] = useState<string | null>(null);
45
+ const fileInputRef = useRef<HTMLInputElement>(null);
46
+ const scrollRef = useRef<HTMLDivElement>(null);
47
+
48
+ const activeSession = sessions.find(s => s.id === activeSessionId) || sessions[0];
49
+
50
+ useEffect(() => {
51
+ if (scrollRef.current) {
52
+ scrollRef.current.scrollTop = scrollRef.current.scrollHeight;
53
+ }
54
+ }, [activeSession.messages, isTyping]);
55
+
56
+ const handleSend = async () => {
57
+ if ((!input.trim() && !uploadedFile) || isTyping) return;
58
+
59
+ const userMessage: Message = {
60
+ id: Date.now().toString(),
61
+ role: 'user',
62
+ content: input || (uploadedFile ? `Uploaded: ${uploadedFile.name}` : ''),
63
+ timestamp: new Date(),
64
+ file: uploadedFile ? { name: uploadedFile.name, size: uploadedFile.size } : undefined,
65
+ };
66
+
67
+ const newMessages = [...activeSession.messages, userMessage];
68
+ updateSession(activeSessionId, newMessages);
69
+ setInput('');
70
+ setIsTyping(true);
71
+
72
+ try {
73
+ // Prefer the build-time VITE_API_URL; fall back to the page's own origin
+ // when the frontend is served by the FastAPI backend itself
74
+ const API_URL = import.meta.env.VITE_API_URL || window.location.origin;
75
+ console.log('API URL:', API_URL);
76
+
77
+ let response;
78
+
79
+ if (uploadedFile) {
80
+ const formData = new FormData();
81
+ formData.append('file', uploadedFile);
82
+ formData.append('task_description', input || 'Analyze this dataset and provide insights');
83
+ formData.append('use_cache', 'true');
84
+ formData.append('max_iterations', '20');
85
+
86
+ response = await fetch(`${API_URL}/run`, {
87
+ method: 'POST',
88
+ body: formData
89
+ });
90
+
91
+ setUploadedFile(null);
92
+ } else {
93
+ response = await fetch(`${API_URL}/chat`, {
94
+ method: 'POST',
95
+ headers: {
96
+ 'Content-Type': 'application/json',
97
+ },
98
+ body: JSON.stringify({
99
+ messages: newMessages.map(m => ({
100
+ role: m.role,
101
+ content: m.content
102
+ })),
103
+ stream: false
104
+ })
105
+ });
106
+ }
107
+
108
+ if (!response.ok) {
109
+ throw new Error(`API error: ${response.status}`);
110
+ }
111
+
112
+ const data = await response.json();
113
+
114
+ let assistantContent = '';
115
+ let reports: Array<{name: string, path: string}> = [];
116
+
117
+ if (uploadedFile && data.result) {
118
+ const result = data.result;
119
+ assistantContent = `✅ Analysis Complete!\n\n`;
120
+
121
+ // Extract report paths from workflow history
122
+ if (result.workflow_history) {
123
+ const reportTools = ['generate_ydata_profiling_report', 'generate_sweetviz_report', 'generate_combined_eda_report'];
124
+ result.workflow_history.forEach((step: any) => {
125
+ if (reportTools.includes(step.tool)) {
126
+ // Check multiple possible locations for the report path
127
+ const reportPath = step.result?.output_path || step.result?.report_path || step.arguments?.output_path;
128
+
129
+ if (reportPath && (step.result?.success !== false)) {
130
+ reports.push({
131
+ name: step.tool.replace('generate_', '').replace(/_/g, ' ').replace('report', '').trim(),
132
+ path: reportPath
133
+ });
134
+ }
135
+ }
136
+ });
137
+ }
138
+
139
+ // Also check for report paths mentioned in the summary text
140
+ if (result.summary && !reports.length) {
141
+ const reportPathMatch = result.summary.match(/\.?(\/outputs\/reports\/[^\s]+\.html)/);
142
+ if (reportPathMatch) {
143
+ reports.push({
144
+ name: 'ydata profiling',
145
+ path: reportPathMatch[1]
146
+ });
147
+ }
148
+ }
149
+
150
+ if (result.summary) {
151
+ assistantContent += `**Summary:**\n${result.summary}\n\n`;
152
+ }
153
+
154
+ if (result.workflow_history && result.workflow_history.length > 0) {
155
+ assistantContent += `**Tools Used:** ${result.workflow_history.length} steps\n\n`;
156
+ assistantContent += `**Final Result:**\n${result.final_result || 'Analysis completed successfully'}`;
157
+ }
158
+ } else if (data.success && data.message) {
159
+ assistantContent = data.message;
160
+ } else {
161
+ throw new Error('Invalid response from API');
162
+ }
163
+
164
+ updateSession(activeSessionId, [...newMessages, {
165
+ id: (Date.now() + 1).toString(),
166
+ role: 'assistant',
167
+ content: assistantContent,
168
+ timestamp: new Date(),
169
+ reports: reports.length > 0 ? reports : undefined
170
+ }]);
171
+ } catch (error: any) {
172
+ console.error("Chat Error:", error);
173
+
174
+ let errorMessage = "I'm sorry, I encountered an error processing your request.";
175
+
176
+ if (error.message) {
177
+ errorMessage += `\n\n**Error:** ${error.message}`;
178
+ }
179
+
180
+ // If a Response-like object was thrown, try to read its error body; plain Error objects are skipped safely
181
+ try {
182
+ const errorText = await error.text?.();
183
+ if (errorText) {
184
+ const errorData = JSON.parse(errorText);
185
+ if (errorData.detail) {
186
+ errorMessage = `**Error:** ${typeof errorData.detail === 'string' ? errorData.detail : JSON.stringify(errorData.detail)}`;
187
+ }
188
+ }
189
+ } catch (e) {
190
+ // Ignore parsing errors
191
+ }
192
+
193
+ updateSession(activeSessionId, [...newMessages, {
194
+ id: 'err-' + Date.now(),
195
+ role: 'assistant',
196
+ content: errorMessage,
197
+ timestamp: new Date()
198
+ }]);
199
+ } finally {
200
+ setIsTyping(false);
201
+ }
202
+ };
203
+
204
+ const updateSession = (id: string, messages: Message[]) => {
205
+ setSessions(prev => prev.map(s => {
206
+ if (s.id === id) {
207
+ return { ...s, messages, updatedAt: new Date() };
208
+ }
209
+ return s;
210
+ }));
211
+ };
212
+
213
+ const createNewChat = () => {
214
+ const newId = Date.now().toString();
215
+ const newSession: ChatSession = {
216
+ id: newId,
217
+ title: 'New Chat',
218
+ messages: [],
219
+ updatedAt: new Date()
220
+ };
221
+ setSessions([newSession, ...sessions]);
222
+ setActiveSessionId(newId);
223
+ };
224
+
225
+ const deleteSession = (e: React.MouseEvent, id: string) => {
226
+ e.stopPropagation();
227
+ if (sessions.length === 1) return;
228
+ setSessions(prev => prev.filter(s => s.id !== id));
229
+ if (activeSessionId === id) {
230
+ setActiveSessionId(sessions.find(s => s.id !== id)?.id || '');
231
+ }
232
+ };
233
+
234
+ const handleFileSelect = (e: React.ChangeEvent<HTMLInputElement>) => {
235
+ const file = e.target.files?.[0];
236
+ if (file) {
237
+ const validTypes = ['.csv', '.parquet'];
238
+ const fileExt = file.name.substring(file.name.lastIndexOf('.')).toLowerCase();
239
+
240
+ if (validTypes.includes(fileExt)) {
241
+ setUploadedFile(file);
242
+ } else {
243
+ alert('Please upload a CSV or Parquet file');
244
+ }
245
+ }
246
+ };
247
+
248
+ const removeFile = () => {
249
+ setUploadedFile(null);
250
+ if (fileInputRef.current) {
251
+ fileInputRef.current.value = '';
252
+ }
253
+ };
254
+
255
+ return (
256
+ <div className="flex h-screen w-full bg-[#050505] overflow-hidden text-white/90">
257
+ {/* Sidebar */}
258
+ <aside className="w-[280px] hidden md:flex flex-col border-r border-white/5 bg-[#0a0a0a]/50 backdrop-blur-xl">
259
+ <div className="p-4 flex flex-col h-full">
260
+ <div className="flex items-center gap-3 mb-8 px-2">
261
+ <Logo className="w-8 h-8" />
262
+ <span className="font-bold tracking-tight text-sm uppercase">Console</span>
263
+ </div>
264
+
265
+ <button
266
+ onClick={createNewChat}
267
+ className="w-full flex items-center gap-3 px-4 py-3 rounded-xl bg-white/5 hover:bg-white/10 border border-white/10 transition-all text-sm font-medium mb-6 group"
268
+ >
269
+ <Plus className="w-4 h-4 group-hover:scale-110 transition-transform" />
270
+ New Conversation
271
+ </button>
272
+
273
+ <div className="flex-1 overflow-y-auto space-y-2 custom-scrollbar">
274
+ <p className="px-3 text-[10px] uppercase tracking-widest text-white/30 font-bold mb-2">History</p>
275
+ {sessions.map(session => (
276
+ <div
277
+ key={session.id}
278
+ onClick={() => setActiveSessionId(session.id)}
279
+ className={cn(
280
+ "group flex items-center justify-between px-4 py-3 rounded-xl cursor-pointer transition-all text-sm",
281
+ activeSessionId === session.id
282
+ ? "bg-white/10 text-white border border-white/10 shadow-lg"
283
+ : "text-white/40 hover:text-white/70 hover:bg-white/5"
284
+ )}
285
+ >
286
+ <span className="truncate flex-1 pr-2">{session.title}</span>
287
+ <Trash2
288
+ onClick={(e) => deleteSession(e, session.id)}
289
+ className="w-4 h-4 opacity-0 group-hover:opacity-100 hover:text-rose-400 transition-all"
290
+ />
291
+ </div>
292
+ ))}
293
+ </div>
294
+
295
+ <div className="mt-auto pt-4 border-t border-white/5 flex items-center justify-between px-2">
296
+ <button onClick={onBack} className="p-2 hover:bg-white/5 rounded-lg transition-colors text-white/40 hover:text-white">
297
+ <ArrowLeft className="w-5 h-5" />
298
+ </button>
299
+ <div className="flex gap-2">
300
+ <button className="p-2 hover:bg-white/5 rounded-lg transition-colors text-white/40 hover:text-white">
301
+ <Settings className="w-5 h-5" />
302
+ </button>
303
+ <button className="p-2 hover:bg-white/5 rounded-lg transition-colors text-white/40 hover:text-white">
304
+ <User className="w-5 h-5" />
305
+ </button>
306
+ </div>
307
+ </div>
308
+ </div>
309
+ </aside>
310
+
311
+ {/* Main Chat Area */}
312
+ <main className="flex-1 flex flex-col relative bg-gradient-to-b from-[#080808] to-[#050505]">
313
+ {/* Top Header */}
314
+ <header className="h-16 flex items-center justify-between px-6 border-b border-white/5 backdrop-blur-md bg-black/20 sticky top-0 z-10">
315
+ <div className="flex items-center gap-4">
316
+ <button onClick={onBack} className="md:hidden p-2 hover:bg-white/5 rounded-lg">
317
+ <ArrowLeft className="w-5 h-5" />
318
+ </button>
319
+ <div>
320
+ <h2 className="text-sm font-bold text-white tracking-tight">{activeSession.title}</h2>
321
+ <p className="text-[10px] text-white/30 font-medium">{activeSession.messages.length} messages in session</p>
322
+ </div>
323
+ </div>
324
+ <div className="flex items-center gap-3">
325
+ <button className="p-2 text-white/40 hover:text-white transition-colors">
326
+ <Search className="w-5 h-5" />
327
+ </button>
328
+ <button className="p-2 text-white/40 hover:text-white transition-colors">
329
+ <MoreHorizontal className="w-5 h-5" />
330
+ </button>
331
+ </div>
332
+ </header>
333
+
334
+ {/* Message List */}
335
+ <div
336
+ ref={scrollRef}
337
+ className="flex-1 overflow-y-auto p-4 md:p-8 space-y-8 scroll-smooth"
338
+ >
339
+ {activeSession.messages.length === 0 ? (
340
+ <div className="h-full flex flex-col items-center justify-center text-center px-4">
341
+ <motion.div
342
+ initial={{ opacity: 0, scale: 0.9 }}
343
+ animate={{ opacity: 1, scale: 1 }}
344
+ className="w-16 h-16 bg-gradient-to-br from-indigo-500/20 to-rose-500/20 rounded-2xl flex items-center justify-center mb-6 border border-white/10"
345
+ >
346
+ <Sparkles className="w-8 h-8 text-indigo-400" />
347
+ </motion.div>
348
+ <h1 className="text-2xl font-extrabold text-white mb-3">Welcome, Data Scientist</h1>
349
+ <p className="text-white/40 max-w-sm leading-relaxed text-sm">
350
+ I'm your autonomous agent ready to profile data, train models, or build dashboards.
351
+ Try uploading a dataset or describing your ML objective.
352
+ </p>
353
+ <div className="grid grid-cols-1 sm:grid-cols-2 gap-3 mt-8 w-full max-w-lg">
354
+ {[
355
+ "Profile my sales.csv",
356
+ "Train a XGBoost classifier",
357
+ "Generate a correlation heatmap",
358
+ "Explain feature importance"
359
+ ].map(prompt => (
360
+ <button
361
+ key={prompt}
362
+ onClick={() => setInput(prompt)}
363
+ className="text-left px-4 py-3 rounded-xl bg-white/[0.03] border border-white/5 hover:bg-white/5 transition-all text-xs text-white/60 hover:text-white"
364
+ >
365
+ "{prompt}"
366
+ </button>
367
+ ))}
368
+ </div>
369
+ </div>
370
+ ) : (
371
+ activeSession.messages.map((msg) => (
372
+ <motion.div
373
+ key={msg.id}
374
+ initial={{ opacity: 0, y: 10 }}
375
+ animate={{ opacity: 1, y: 0 }}
376
+ className={cn(
377
+ "flex w-full gap-4",
378
+ msg.role === 'user' ? "flex-row-reverse" : "flex-row"
379
+ )}
380
+ >
381
+ <div className={cn(
382
+ "w-8 h-8 rounded-lg flex items-center justify-center shrink-0 border border-white/10",
383
+ msg.role === 'user' ? "bg-indigo-500/20" : "bg-white/5"
384
+ )}>
385
+ {msg.role === 'user' ? <User className="w-4 h-4" /> : <Bot className="w-4 h-4 text-indigo-400" />}
386
+ </div>
387
+ <div className={cn(
388
+ "max-w-[80%] md:max-w-[70%] p-4 rounded-2xl text-sm leading-relaxed",
389
+ msg.role === 'user'
390
+ ? "bg-indigo-600/20 text-indigo-50 border border-indigo-500/20"
391
+ : "bg-white/[0.03] text-white/80 border border-white/5"
392
+ )}>
393
+ {msg.file && (
394
+ <div className="mb-2 flex items-center gap-2 text-xs bg-white/5 rounded-lg px-3 py-2 border border-white/10">
395
+ <Paperclip className="w-3 h-3" />
396
+ <span className="font-medium">{msg.file.name}</span>
397
+ <span className="text-white/40">({(msg.file.size / 1024).toFixed(1)} KB)</span>
398
+ </div>
399
+ )}
400
+ {msg.role === 'assistant' ? (
401
+ <ReactMarkdown
402
+ className="prose prose-invert prose-sm max-w-none prose-p:leading-relaxed prose-pre:bg-black/40 prose-pre:border prose-pre:border-white/10 prose-headings:text-white prose-strong:text-white prose-li:text-white/80"
403
+ components={{
404
+ p: ({node, ...props}) => <p className="mb-3 last:mb-0" {...props} />,
405
+ ul: ({node, ...props}) => <ul className="mb-3 space-y-1" {...props} />,
406
+ ol: ({node, ...props}) => <ol className="mb-3 space-y-1" {...props} />,
407
+ li: ({node, ...props}) => <li className="ml-4" {...props} />,
408
+ strong: ({node, ...props}) => <strong className="font-semibold text-white" {...props} />,
409
+ code: ({node, inline, ...props}: any) =>
410
+ inline ?
411
+ <code className="px-1.5 py-0.5 rounded bg-white/10 text-indigo-300 text-xs font-mono" {...props} /> :
412
+ <code className="block p-3 rounded-lg bg-black/40 border border-white/10 text-xs font-mono overflow-x-auto" {...props} />
413
+ }}
414
+ >
415
+ {msg.content || ''}
416
+ </ReactMarkdown>
417
+ ) : (
418
+ msg.content /* user messages render as plain text */
419
+ )}
420
+ {msg.reports && msg.reports.length > 0 && (
421
+ <div className="mt-4 flex flex-wrap gap-2">
422
+ {msg.reports.map((report, idx) => (
423
+ <button
424
+ key={idx}
425
+ onClick={() => setReportModalUrl(`${window.location.origin}${report.path}`)}
426
+ className="flex items-center gap-2 px-4 py-2 rounded-lg bg-indigo-500/20 hover:bg-indigo-500/30 border border-indigo-500/30 text-indigo-200 text-xs font-medium transition-all group"
427
+ >
428
+ <Sparkles className="w-3.5 h-3.5 group-hover:scale-110 transition-transform" />
429
+ View {report.name} Report
430
+ </button>
431
+ ))}
432
+ </div>
433
+ )}
434
+ <div className="mt-2 text-[10px] opacity-20 font-mono">
435
+ {msg.timestamp.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}
436
+ </div>
437
+ </div>
438
+ </motion.div>
439
+ ))
440
+ )}
441
+ {isTyping && activeSession.messages[activeSession.messages.length - 1]?.role === 'user' && (
442
+ <div className="flex gap-4">
443
+ <div className="w-8 h-8 rounded-lg flex items-center justify-center shrink-0 bg-white/5 border border-white/10">
444
+ <Bot className="w-4 h-4 text-indigo-400" />
445
+ </div>
446
+ <div className="bg-white/[0.03] p-4 rounded-2xl border border-white/5">
447
+ <div className="flex gap-1">
448
+ <span className="w-1.5 h-1.5 bg-white/20 rounded-full animate-bounce [animation-delay:-0.3s]"></span>
449
+ <span className="w-1.5 h-1.5 bg-white/20 rounded-full animate-bounce [animation-delay:-0.15s]"></span>
450
+ <span className="w-1.5 h-1.5 bg-white/20 rounded-full animate-bounce"></span>
451
+ </div>
452
+ </div>
453
+ </div>
454
+ )}
455
+ </div>
456
+
457
+ {/* Input Bar */}
458
+ <div className="p-4 md:p-8 pt-0">
459
+ <div className="max-w-4xl mx-auto relative">
460
+ <div className="absolute -top-10 left-4 flex gap-2">
461
+ <input
462
+ ref={fileInputRef}
463
+ type="file"
464
+ accept=".csv,.parquet"
465
+ onChange={handleFileSelect}
466
+ className="hidden"
467
+ id="file-upload"
468
+ />
469
+ <label
470
+ htmlFor="file-upload"
471
+ className="flex items-center gap-1.5 px-3 py-1 rounded-full bg-white/[0.03] border border-white/5 text-[10px] text-white/40 hover:text-white hover:bg-white/5 transition-all cursor-pointer"
472
+ >
473
+ <Upload className="w-3 h-3" /> Upload Dataset
474
+ </label>
475
+ {uploadedFile && (
476
+ <div className="flex items-center gap-2 px-3 py-1 rounded-full bg-indigo-500/20 border border-indigo-500/30 text-[10px] text-indigo-200">
477
+ <Paperclip className="w-3 h-3" />
478
+ <span className="max-w-[150px] truncate">{uploadedFile.name}</span>
479
+ <button onClick={removeFile} className="hover:text-white transition-colors">
480
+ <X className="w-3 h-3" />
481
+ </button>
482
+ </div>
483
+ )}
484
+ </div>
485
+ <div className="relative group">
486
+ <textarea
487
+ value={input}
488
+ onChange={(e) => setInput(e.target.value)}
489
+ onKeyDown={(e) => {
490
+ if (e.key === 'Enter' && !e.shiftKey) {
491
+ e.preventDefault();
492
+ handleSend();
493
+ }
494
+ }}
495
+ placeholder={uploadedFile ? "Describe what you want to do with this dataset..." : "Ask your agent anything or upload a dataset..."}
496
+ className="w-full bg-[#0d0d0d] border border-white/10 rounded-2xl p-4 pr-16 text-sm min-h-[56px] max-h-48 resize-none focus:outline-none focus:border-indigo-500/50 focus:ring-1 focus:ring-indigo-500/20 transition-all text-white/90 placeholder:text-white/20 shadow-2xl"
497
+ />
498
+ <button
499
+ onClick={handleSend}
500
+ disabled={(!input.trim() && !uploadedFile) || isTyping}
501
+ className={cn(
502
+ "absolute right-3 bottom-3 p-2.5 rounded-xl transition-all",
503
+ (input.trim() || uploadedFile) && !isTyping
504
+ ? "bg-white text-black hover:scale-105 active:scale-95"
505
+ : "bg-white/5 text-white/20 cursor-not-allowed"
506
+ )}
507
+ >
508
+ <Send className="w-4 h-4" />
509
+ </button>
510
+ </div>
511
+ <p className="text-center mt-3 text-[10px] text-white/20 font-medium">
512
+ Enterprise Data Agent v3.1 | Secured with end-to-end encryption
513
+ </p>
514
+ </div>
515
+ </div>
516
+ </main>
517
+
518
+ {/* Report Modal */}
519
+ <AnimatePresence>
520
+ {reportModalUrl && (
521
+ <motion.div
522
+ initial={{ opacity: 0 }}
523
+ animate={{ opacity: 1 }}
524
+ exit={{ opacity: 0 }}
525
+ className="fixed inset-0 bg-black/80 backdrop-blur-sm z-50 flex items-center justify-center p-4"
526
+ onClick={() => setReportModalUrl(null)}
527
+ >
528
+ <motion.div
529
+ initial={{ scale: 0.95, opacity: 0 }}
530
+ animate={{ scale: 1, opacity: 1 }}
531
+ exit={{ scale: 0.95, opacity: 0 }}
532
+ className="bg-[#0a0a0a] border border-white/10 rounded-2xl w-full max-w-7xl h-[90vh] flex flex-col overflow-hidden shadow-2xl"
533
+ onClick={(e) => e.stopPropagation()}
534
+ >
535
+ <div className="flex items-center justify-between p-4 border-b border-white/5">
536
+ <h3 className="text-lg font-semibold text-white">Data Profiling Report</h3>
537
+ <button
538
+ onClick={() => setReportModalUrl(null)}
539
+ className="p-2 rounded-lg hover:bg-white/5 transition-colors"
540
+ >
541
+ <X className="w-5 h-5" />
542
+ </button>
543
+ </div>
544
+ <iframe
545
+ src={reportModalUrl}
546
+ className="flex-1 w-full bg-white"
547
+ title="Report Viewer"
548
+ />
549
+ </motion.div>
550
+ </motion.div>
551
+ )}
552
+ </AnimatePresence>
553
+
554
+ <style>{`
555
+ .custom-scrollbar::-webkit-scrollbar {
556
+ width: 4px;
557
+ }
558
+ .custom-scrollbar::-webkit-scrollbar-track {
559
+ background: transparent;
560
+ }
561
+ .custom-scrollbar::-webkit-scrollbar-thumb {
562
+ background: rgba(255, 255, 255, 0.05);
563
+ border-radius: 10px;
564
+ }
565
+ .custom-scrollbar::-webkit-scrollbar-thumb:hover {
566
+ background: rgba(255, 255, 255, 0.1);
567
+ }
568
+ `}</style>
569
+ </div>
570
+ );
571
+ };
FRRONTEEEND/components/Footer.tsx ADDED
@@ -0,0 +1,171 @@
1
+
2
+ import React, { useRef, useId, useEffect } from 'react';
3
+ import { motion, animate, useMotionValue, AnimationPlaybackControls } from 'framer-motion';
4
+ import { ArrowRight } from 'lucide-react';
5
+ import { Logo } from './Logo';
6
+
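+ // Linearly remap value from the range [fromLow, fromHigh] to [toLow, toHigh].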
7
+ function mapRange(
8
+ value: number,
9
+ fromLow: number,
10
+ fromHigh: number,
11
+ toLow: number,
12
+ toHigh: number
13
+ ): number {
14
+ if (fromLow === fromHigh) {
15
+ return toLow;
16
+ }
17
+ const percentage = (value - fromLow) / (fromHigh - fromLow);
18
+ return toLow + percentage * (toHigh - toLow);
19
+ }
20
+
21
+ const Footer = () => {
22
+ const id = useId().replace(/:/g, "");
23
+ const instanceId = `footer-shadow-${id}`;
24
+ const feColorMatrixRef = useRef<SVGFEColorMatrixElement>(null);
25
+ const hueRotateMotionValue = useMotionValue(0);
26
+ const hueRotateAnimation = useRef<AnimationPlaybackControls | null>(null);
27
+
28
+ // Configuration from ShadowSection
29
+ const animationScale = 50;
30
+ const animationSpeed = 15;
31
+ const displacementScale = mapRange(animationScale, 1, 100, 20, 100);
32
+ const animationDuration = mapRange(animationSpeed, 1, 100, 1000, 50);
33
+
34
+ useEffect(() => {
35
+ if (feColorMatrixRef.current) {
36
+ hueRotateAnimation.current = animate(hueRotateMotionValue, 360, {
37
+ duration: animationDuration / 25,
38
+ repeat: Infinity,
39
+ repeatType: "loop",
40
+ ease: "linear",
41
+ onUpdate: (value: number) => {
42
+ if (feColorMatrixRef.current) {
43
+ feColorMatrixRef.current.setAttribute("values", String(value));
44
+ }
45
+ }
46
+ });
47
+ return () => hueRotateAnimation.current?.stop();
48
+ }
49
+ }, [animationDuration, hueRotateMotionValue]);
50
+
51
+ return (
52
+ <footer className="bg-[#030303] overflow-hidden">
53
+ {/* High-Impact CTA with Atmospheric Shadow UI */}
54
+ <section className="relative w-full py-32 md:py-48 flex items-center justify-center border-t border-white/5">
55
+ <div
56
+ className="absolute inset-0 pointer-events-none overflow-hidden"
57
+ style={{
58
+ filter: `url(#${instanceId}) blur(12px)`,
59
+ opacity: 0.8
60
+ }}
61
+ >
62
+ <svg style={{ position: "absolute", width: 0, height: 0 }}>
63
+ <defs>
64
+ <filter id={instanceId}>
65
+ <feTurbulence
66
+ result="undulation"
67
+ numOctaves="2"
68
+ baseFrequency={`${mapRange(animationScale, 0, 100, 0.001, 0.0005)},${mapRange(animationScale, 0, 100, 0.004, 0.002)}`}
69
+ seed="0"
70
+ type="turbulence"
71
+ />
72
+ <feColorMatrix
73
+ ref={feColorMatrixRef}
74
+ in="undulation"
75
+ type="hueRotate"
76
+ values="180"
77
+ />
78
+ <feColorMatrix
79
+ in="dist"
80
+ result="circulation"
81
+ type="matrix"
82
+ values="4 0 0 0 1 4 0 0 0 1 4 0 0 0 1 1 0 0 0 0"
83
+ />
84
+ <feDisplacementMap
85
+ in="SourceGraphic"
86
+ in2="circulation"
87
+ scale={displacementScale}
88
+ result="dist"
89
+ />
90
+ <feDisplacementMap
91
+ in="dist"
92
+ in2="undulation"
93
+ scale={displacementScale}
94
+ result="output"
95
+ />
96
+ </filter>
97
+ </defs>
98
+ </svg>
99
+ <div
100
+ style={{
101
+ backgroundColor: 'rgba(99, 102, 241, 0.4)',
102
+ maskImage: `url('https://framerusercontent.com/images/ceBGguIpUU8luwByxuQz79t7To.png')`,
103
+ maskSize: "cover",
104
+ maskRepeat: "no-repeat",
105
+ maskPosition: "center",
106
+ width: "120%",
107
+ height: "120%",
108
+ position: 'absolute',
109
+ top: '-10%',
110
+ left: '-10%'
111
+ }}
112
+ />
113
+ </div>
114
+
115
+ {/* Noise overlay */}
116
+ <div
117
+ className="absolute inset-0 pointer-events-none opacity-[0.03]"
118
+ style={{
119
+ backgroundImage: `url("https://framerusercontent.com/images/g0QcWrxr87K0ufOxIUFBakwYA8.png")`,
120
+ backgroundSize: '100px',
121
+ backgroundRepeat: "repeat",
122
+ }}
123
+ />
124
+
125
+ <div className="relative z-20 max-w-7xl mx-auto px-6 text-center">
126
+ <motion.div
127
+ initial={{ opacity: 0, y: 30 }}
128
+ whileInView={{ opacity: 1, y: 0 }}
129
+ viewport={{ once: true }}
130
+ transition={{ duration: 0.8 }}
131
+ >
132
+ <h2 className="text-4xl md:text-7xl font-extrabold text-white mb-8 tracking-tighter">
133
+ Ready to automate your workflow?
134
+ </h2>
135
+ <p className="text-white/50 text-xl md:text-2xl mb-12 max-w-2xl mx-auto font-medium leading-relaxed">
136
+ Build smarter ML workflows with AI autonomy. Join the next generation of data scientists.
137
+ </p>
138
+ <button className="group relative px-10 py-5 bg-white text-black font-extrabold rounded-2xl transition-all hover:scale-105 active:scale-95 shadow-[0_0_50px_-12px_rgba(255,255,255,0.5)] flex items-center gap-3 mx-auto">
139
+ Get Started Now
140
+ <ArrowRight className="w-5 h-5 group-hover:translate-x-1 transition-transform" />
141
+ </button>
142
+ </motion.div>
143
+ </div>
144
+
145
+ {/* Gradient fades to blend with rest of footer */}
146
+ <div className="absolute inset-x-0 bottom-0 h-40 bg-gradient-to-t from-[#030303] to-transparent z-10" />
147
+ <div className="absolute inset-x-0 top-0 h-40 bg-gradient-to-b from-[#030303] to-transparent z-10" />
148
+ </section>
149
+
150
+ {/* Main Footer Links */}
151
+ <div className="max-w-7xl mx-auto px-6 pb-20">
152
+ <div className="pt-8 border-t border-white/5 flex flex-col md:flex-row justify-between items-center gap-6">
153
+ <div className="flex items-center gap-4">
154
+ <Logo className="w-8 h-8" />
155
+ <span className="text-white font-extrabold tracking-tight uppercase">DATA SCIENCE AGENT</span>
156
+ </div>
157
+ <div className="text-white/30 text-[10px] sm:text-xs font-semibold uppercase tracking-wider">
158
+ © 2025 Data Science Agent. Built for the autonomous future.
159
+ </div>
160
+ <div className="flex gap-8 text-white/40 text-sm font-bold italic">
161
+ <a href="#" className="hover:text-white transition-colors">Twitter</a>
162
+ <a href="#" className="hover:text-white transition-colors">GitHub</a>
163
+ <a href="#" className="hover:text-white transition-colors">Docs</a>
164
+ </div>
165
+ </div>
166
+ </div>
167
+ </footer>
168
+ );
169
+ };
170
+
171
+ export default Footer;
FRRONTEEEND/components/HeroGeometric.tsx ADDED
@@ -0,0 +1,213 @@
1
+
2
+ import React from 'react';
3
+ import { motion, Variants } from "framer-motion";
4
+ import { Circle, MessageSquare } from "lucide-react";
5
+ import { cn } from "../lib/utils";
6
+
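+ // Decorative floating gradient capsule for the hero background; props control size, tilt, gradient, and entrance delay.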
7
+ function ElegantShape({
8
+ className,
9
+ delay = 0,
10
+ width = 400,
11
+ height = 100,
12
+ rotate = 0,
13
+ gradient = "from-white/[0.08]",
14
+ }: {
15
+ className?: string;
16
+ delay?: number;
17
+ width?: number;
18
+ height?: number;
19
+ rotate?: number;
20
+ gradient?: string;
21
+ }) {
22
+ return (
23
+ <motion.div
24
+ initial={{
25
+ opacity: 0,
26
+ y: -150,
27
+ rotate: rotate - 15,
28
+ }}
29
+ animate={{
30
+ opacity: 1,
31
+ y: 0,
32
+ rotate: rotate,
33
+ }}
34
+ transition={{
35
+ duration: 2.4,
36
+ delay,
37
+ ease: [0.23, 0.86, 0.39, 0.96],
38
+ opacity: { duration: 1.2 },
39
+ }}
40
+ className={cn("absolute", className)}
41
+ >
42
+ <motion.div
43
+ animate={{
44
+ y: [0, 15, 0],
45
+ }}
46
+ transition={{
47
+ duration: 12,
48
+ repeat: Number.POSITIVE_INFINITY,
49
+ ease: "easeInOut",
50
+ }}
51
+ style={{
52
+ width,
53
+ height,
54
+ }}
55
+ className="relative"
56
+ >
57
+ <div
58
+ className={cn(
59
+ "absolute inset-0 rounded-full",
60
+ "bg-gradient-to-r to-transparent",
61
+ gradient,
62
+ "backdrop-blur-[2px] border-2 border-white/[0.15]",
63
+ "shadow-[0_8px_32px_0_rgba(255,255,255,0.1)]",
64
+ "after:absolute after:inset-0 after:rounded-full",
65
+ "after:bg-[radial-gradient(circle_at_50%_50%,rgba(255,255,255,0.2),transparent_70%)]"
66
+ )}
67
+ />
68
+ </motion.div>
69
+ </motion.div>
70
+ );
71
+ }
72
+
73
+ export function HeroGeometric({
74
+ badge = "Autonomous AI for Data Science",
75
+ title1 = "DATA SCIENCE AGENT",
76
+ title2 = "Autonomous AI for End-to-End ML",
77
+ onChatClick,
78
+ }: {
79
+ badge?: string;
80
+ title1?: string;
81
+ title2?: string;
82
+ onChatClick?: () => void;
83
+ }) {
84
+ const fadeUpVariants: Variants = {
85
+ hidden: { opacity: 0, y: 30 },
86
+ visible: (i: number) => ({
87
+ opacity: 1,
88
+ y: 0,
89
+ transition: {
90
+ duration: 1,
91
+ delay: 0.5 + i * 0.2,
92
+ ease: [0.25, 0.4, 0.25, 1] as [number, number, number, number],
93
+ },
94
+ }),
95
+ };
96
+
97
+ return (
98
+ <div className="relative min-h-screen w-full flex items-center justify-center overflow-hidden bg-[#030303]">
99
+ <div className="absolute inset-0 bg-gradient-to-br from-indigo-500/[0.05] via-transparent to-rose-500/[0.05] blur-3xl" />
100
+
101
+ <div className="absolute inset-0 overflow-hidden">
102
+ <ElegantShape
103
+ delay={0.3}
104
+ width={600}
105
+ height={140}
106
+ rotate={12}
107
+ gradient="from-indigo-500/[0.15]"
108
+ className="left-[-10%] md:left-[-5%] top-[15%] md:top-[20%]"
109
+ />
110
+ <ElegantShape
111
+ delay={0.5}
112
+ width={500}
113
+ height={120}
114
+ rotate={-15}
115
+ gradient="from-rose-500/[0.15]"
116
+ className="right-[-5%] md:right-[0%] top-[70%] md:top-[75%]"
117
+ />
118
+ <ElegantShape
119
+ delay={0.4}
120
+ width={300}
121
+ height={80}
122
+ rotate={-8}
123
+ gradient="from-violet-500/[0.15]"
124
+ className="left-[5%] md:left-[10%] bottom-[5%] md:bottom-[10%]"
125
+ />
126
+ <ElegantShape
127
+ delay={0.6}
128
+ width={200}
129
+ height={60}
130
+ rotate={20}
131
+ gradient="from-amber-500/[0.15]"
132
+ className="right-[15%] md:right-[20%] top-[10%] md:top-[15%]"
133
+ />
134
+ <ElegantShape
135
+ delay={0.7}
136
+ width={150}
137
+ height={40}
138
+ rotate={-25}
139
+ gradient="from-cyan-500/[0.15]"
140
+ className="left-[20%] md:left-[25%] top-[5%] md:top-[10%]"
141
+ />
142
+ </div>
143
+
144
+ <div className="relative z-10 container mx-auto px-4 md:px-6">
145
+ <div className="max-w-4xl mx-auto text-center">
146
+ <motion.div
147
+ custom={0}
148
+ variants={fadeUpVariants}
149
+ initial="hidden"
150
+ animate="visible"
151
+ className="inline-flex items-center gap-2 px-4 py-1.5 rounded-full bg-white/[0.03] border border-white/[0.08] mb-6 md:mb-10"
152
+ >
153
+ <Circle className="h-2 w-2 fill-indigo-500/80" />
154
+ <span className="text-xs font-semibold text-white/60 tracking-[0.1em] uppercase">
155
+ {badge}
156
+ </span>
157
+ </motion.div>
158
+
159
+ <motion.div
160
+ custom={1}
161
+ variants={fadeUpVariants}
162
+ initial="hidden"
163
+ animate="visible"
164
+ >
165
+ <h1 className="text-3xl sm:text-4xl md:text-6xl font-extrabold mb-6 md:mb-8 tracking-tight leading-[1.1]">
166
+ <span className="bg-clip-text text-transparent bg-gradient-to-b from-white to-white/80">
167
+ {title1}
168
+ </span>
169
+ <br />
170
+ <span
171
+ className={cn(
172
+ "bg-clip-text text-transparent bg-gradient-to-r from-indigo-300 via-white/90 to-rose-300"
173
+ )}
174
+ >
175
+ {title2}
176
+ </span>
177
+ </h1>
178
+ </motion.div>
179
+
180
+ <motion.div
181
+ custom={2}
182
+ variants={fadeUpVariants}
183
+ initial="hidden"
184
+ animate="visible"
185
+ >
186
+ <p className="text-sm sm:text-base md:text-lg text-white/40 mb-10 leading-relaxed font-normal tracking-tight max-w-xl mx-auto px-4">
187
+ Upload your data. Describe your goal.
188
+ Let AI handle profiling, modeling, visualization, and strategic insights autonomously.
189
+ </p>
190
+ </motion.div>
191
+
192
+ <motion.div
193
+ custom={3}
194
+ variants={fadeUpVariants}
195
+ initial="hidden"
196
+ animate="visible"
197
+ className="flex flex-col sm:flex-row items-center justify-center gap-4 px-4"
198
+ >
199
+ <button
200
+ onClick={onChatClick}
201
+ className="w-full sm:w-auto px-8 py-3.5 bg-white text-black font-bold rounded-xl hover:bg-white/90 transition-all flex items-center justify-center gap-2 group text-sm shadow-xl"
202
+ >
203
+ Chat Now
204
+ <MessageSquare className="w-4 h-4 fill-black group-hover:translate-x-0.5 transition-transform" />
205
+ </button>
206
+ </motion.div>
207
+ </div>
208
+ </div>
209
+
210
+ <div className="absolute inset-0 bg-gradient-to-t from-[#030303] via-transparent to-[#030303]/80 pointer-events-none" />
211
+ </div>
212
+ );
213
+ }
FRRONTEEEND/components/KeyCapabilities.tsx ADDED
@@ -0,0 +1,91 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+ import { Database, Wrench, Cpu, Brain, LineChart, Server } from 'lucide-react';
5
+ import { cn } from '../lib/utils';
6
+
7
+ const capabilities = [
8
+ {
9
+ title: "Autonomous ML Pipelines",
10
+ description: "End-to-end automation from profiling to deployment without manual coding.",
11
+ icon: Database,
12
+ color: "from-blue-500/20 to-cyan-500/20",
13
+ hover: "hover:bg-blue-500/10 hover:border-blue-500/30 hover:shadow-[0_0_30px_-10px_rgba(59,130,246,0.2)]"
14
+ },
15
+ {
16
+ title: "82+ Specialized Tools",
17
+ description: "An extensive arsenal for cleaning, statistical testing, and predictive modeling.",
18
+ icon: Wrench,
19
+ color: "from-purple-500/20 to-pink-500/20",
20
+ hover: "hover:bg-pink-500/10 hover:border-pink-500/30 hover:shadow-[0_0_30px_-10px_rgba(236,72,153,0.2)]"
21
+ },
22
+ {
23
+ title: "Dual LLM Intelligence",
24
+ description: "Orchestrated by Groq (for speed) and Gemini (for deep reasoning).",
25
+ icon: Brain,
26
+ color: "from-orange-500/20 to-amber-500/20",
27
+ hover: "hover:bg-amber-500/10 hover:border-amber-500/30 hover:shadow-[0_0_30px_-10px_rgba(245,158,11,0.2)]"
28
+ },
29
+ {
30
+ title: "Session Memory",
31
+ description: "Maintains context across complex workflows, allowing for iterative refinement.",
32
+ icon: Cpu,
33
+ color: "from-emerald-500/20 to-teal-500/20",
34
+ hover: "hover:bg-emerald-500/10 hover:border-emerald-500/30 hover:shadow-[0_0_30px_-10px_rgba(16,185,129,0.2)]"
35
+ },
36
+ {
37
+ title: "Visual Insights",
38
+ description: "Automatic generation of publication-quality charts and explainability reports.",
39
+ icon: LineChart,
40
+ color: "from-indigo-500/20 to-blue-500/20",
41
+ hover: "hover:bg-indigo-500/10 hover:border-indigo-500/30 hover:shadow-[0_0_30px_-10px_rgba(99,102,241,0.2)]"
42
+ },
43
+ {
44
+ title: "Cloud Run Ready",
45
+ description: "Deploy your optimized models directly to production-grade cloud environments.",
46
+ icon: Server,
47
+ color: "from-rose-500/20 to-red-500/20",
48
+ hover: "hover:bg-rose-500/10 hover:border-rose-500/30 hover:shadow-[0_0_30px_-10px_rgba(244,63,94,0.2)]"
49
+ }
50
+ ];
51
+
52
+ const KeyCapabilities = () => {
53
+ return (
54
+ <section id="features" className="py-24 bg-[#030303]">
55
+ <div className="max-w-7xl mx-auto px-6">
56
+ <div className="text-center mb-16">
57
+ <h2 className="text-4xl md:text-5xl font-extrabold text-white mb-4 tracking-tight">Powerful Orchestration</h2>
58
+ <p className="text-white/40 text-xl font-medium">Not just a chatbot, but a true system of intelligence.</p>
59
+ </div>
60
+
61
+ <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-8">
62
+ {capabilities.map((cap, i) => (
63
+ <motion.div
64
+ key={i}
65
+ initial={{ opacity: 0, y: 20 }}
66
+ whileInView={{ opacity: 1, y: 0 }}
67
+ viewport={{ once: true }}
68
+ transition={{ delay: i * 0.1 }}
69
+ whileHover={{ scale: 1.02, y: -5 }}
70
+ className={cn(
71
+ "group p-8 rounded-2xl bg-white/[0.02] border border-white/[0.08] transition-all duration-300 cursor-default",
72
+ cap.hover
73
+ )}
74
+ >
75
+ <div className={cn(
76
+ "w-12 h-12 rounded-lg bg-gradient-to-br flex items-center justify-center mb-6 group-hover:scale-110 transition-transform duration-300",
77
+ cap.color
78
+ )}>
79
+ <cap.icon className="w-6 h-6 text-white" />
80
+ </div>
81
+ <h3 className="text-xl font-bold text-white mb-3 tracking-tight">{cap.title}</h3>
82
+ <p className="text-white/50 leading-relaxed font-medium">{cap.description}</p>
83
+ </motion.div>
84
+ ))}
85
+ </div>
86
+ </div>
87
+ </section>
88
+ );
89
+ };
90
+
91
+ export default KeyCapabilities;
FRRONTEEEND/components/Logo.tsx ADDED
@@ -0,0 +1,92 @@
1
+
2
+ import React from 'react';
3
+ import { cn } from '../lib/utils';
4
+
5
+ interface LogoProps {
6
+ className?: string;
7
+ showText?: boolean;
8
+ }
9
+
10
+ export const Logo: React.FC<LogoProps> = ({ className, showText = false }) => {
11
+ return (
12
+ <div className={cn("flex flex-col items-center", className)}>
13
+ <svg
14
+ viewBox="0 0 120 120"
15
+ className="w-full h-full"
16
+ fill="none"
17
+ xmlns="http://www.w3.org/2000/svg"
18
+ >
19
+ <defs>
20
+ <linearGradient id="logoGradient" x1="0%" y1="0%" x2="100%" y2="100%">
21
+ <stop offset="0%" stopColor="#22d3ee" />
22
+ <stop offset="100%" stopColor="#6366f1" />
23
+ </linearGradient>
24
+ <filter id="glow" x="-20%" y="-20%" width="140%" height="140%">
25
+ <feGaussianBlur stdDeviation="2" result="blur" />
26
+ <feComposite in="SourceGraphic" in2="blur" operator="over" />
27
+ </filter>
28
+ </defs>
29
+
30
+ {/* Central Core */}
31
+ <circle cx="60" cy="60" r="6" fill="url(#logoGradient)" filter="url(#glow)" />
32
+
33
+ {/* Inner Circuit Ring */}
34
+ <circle cx="60" cy="60" r="18" stroke="url(#logoGradient)" strokeWidth="1" strokeDasharray="2 4" opacity="0.4" />
35
+
36
+ {/* Complex Neural Paths (Stylized) */}
37
+ <g opacity="0.8">
38
+ {[0, 45, 90, 135, 180, 225, 270, 315].map((angle) => (
39
+ <g key={angle} transform={`rotate(${angle} 60 60)`}>
40
+ <path
41
+ d="M60 35 L60 30 M60 30 L55 25 M60 30 L65 25"
42
+ stroke="url(#logoGradient)"
43
+ strokeWidth="1.5"
44
+ strokeLinecap="round"
45
+ />
46
+ <circle cx="55" cy="25" r="1.5" fill="url(#logoGradient)" />
47
+ <circle cx="65" cy="25" r="1.5" fill="url(#logoGradient)" />
48
+ </g>
49
+ ))}
50
+ </g>
51
+
52
+ {/* Middle Dashed Ring */}
53
+ <circle cx="60" cy="60" r="32" stroke="url(#logoGradient)" strokeWidth="1.5" strokeDasharray="10 6" opacity="0.6" />
54
+
55
+ {/* Outer Orbital with Squares */}
56
+ <circle cx="60" cy="60" r="45" stroke="url(#logoGradient)" strokeWidth="0.5" opacity="0.3" />
57
+ {[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330].map((angle) => (
58
+ <rect
59
+ key={angle}
60
+ x="58"
61
+ y="12"
62
+ width="4"
63
+ height="4"
64
+ fill="url(#logoGradient)"
65
+ transform={`rotate(${angle} 60 60)`}
66
+ rx="1"
67
+ />
68
+ ))}
69
+
70
+ {/* Connection Spokes */}
71
+ {[0, 90, 180, 270].map((angle) => (
72
+ <line
73
+ key={angle}
74
+ x1="60"
75
+ y1="16"
76
+ x2="60"
77
+ y2="30"
78
+ stroke="url(#logoGradient)"
79
+ strokeWidth="1"
80
+ opacity="0.5"
81
+ transform={`rotate(${angle} 60 60)`}
82
+ />
83
+ ))}
84
+ </svg>
85
+ {showText && (
86
+ <span className="mt-2 text-white font-extrabold tracking-widest text-[10px] sm:text-xs uppercase">
87
+ DATA SCIENCE AGENT
88
+ </span>
89
+ )}
90
+ </div>
91
+ );
92
+ };
FRRONTEEEND/components/ProblemSolution.tsx ADDED
@@ -0,0 +1,70 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+ import { AlertCircle, Zap, ShieldCheck, Clock } from 'lucide-react';
5
+
6
+ const ProblemSolution = () => {
7
+ return (
8
+ <section className="py-24 relative bg-[#030303] overflow-hidden">
9
+ <div className="max-w-7xl mx-auto px-6">
10
+ <div className="grid grid-cols-1 lg:grid-cols-2 gap-16 items-center">
11
+ <motion.div
12
+ initial={{ opacity: 0, x: -30 }}
13
+ whileInView={{ opacity: 1, x: 0 }}
14
+ viewport={{ once: true }}
15
+ transition={{ duration: 0.8 }}
16
+ >
17
+ <h2 className="text-3xl md:text-5xl font-extrabold text-white mb-6 tracking-tight">
18
+ The Data Science <span className="text-rose-400">Bottleneck</span>
19
+ </h2>
20
+ <p className="text-white/60 text-lg mb-8 leading-relaxed font-medium">
21
+ Modern data science is 80% manual labor. Cleaning messy datasets, engineering features, and tuning models take weeks of repetitive effort. Mistakes are costly, and scaling insights is slow.
22
+ </p>
23
+ <ul className="space-y-4">
24
+ {[
25
+ { icon: AlertCircle, text: "Error-prone manual data preprocessing", color: "text-rose-400" },
26
+ { icon: Clock, text: "Days spent on hyperparameter tuning", color: "text-rose-400" },
27
+ { icon: AlertCircle, text: "Disconnected silos of code and insights", color: "text-rose-400" },
28
+ ].map((item, i) => (
29
+ <li key={i} className="flex items-center gap-3 text-white/80 font-semibold">
30
+ <item.icon className={`w-5 h-5 ${item.color}`} />
31
+ <span>{item.text}</span>
32
+ </li>
33
+ ))}
34
+ </ul>
35
+ </motion.div>
36
+
37
+ <motion.div
38
+ initial={{ opacity: 0, x: 30 }}
39
+ whileInView={{ opacity: 1, x: 0 }}
40
+ viewport={{ once: true }}
41
+ transition={{ duration: 0.8 }}
42
+ className="relative p-8 md:p-12 rounded-3xl bg-gradient-to-br from-indigo-500/10 via-white/5 to-rose-500/10 border border-white/10"
43
+ >
44
+ <div className="absolute -top-6 -right-6 w-32 h-32 bg-indigo-500/20 blur-3xl" />
45
+ <h2 className="text-3xl md:text-5xl font-extrabold text-white mb-6 tracking-tight">
46
+ The <span className="text-indigo-400">Autonomous</span> Solution
47
+ </h2>
48
+ <p className="text-white/60 text-lg mb-8 leading-relaxed font-medium">
49
+ DATA SCIENCE AGENT automates the entire lifecycle. From raw CSV to production-ready models and interactive dashboards, our agent uses 82+ specialized tools to deliver precision at scale.
50
+ </p>
51
+ <ul className="space-y-4">
52
+ {[
53
+ { icon: Zap, text: "Instant feature engineering and selection", color: "text-indigo-400" },
54
+ { icon: ShieldCheck, text: "Automated error recovery and re-training", color: "text-indigo-400" },
55
+ { icon: Zap, text: "Explainable AI (XAI) reports by default", color: "text-indigo-400" },
56
+ ].map((item, i) => (
57
+ <li key={i} className="flex items-center gap-3 text-white/80 font-semibold">
58
+ <item.icon className={`w-5 h-5 ${item.color}`} />
59
+ <span>{item.text}</span>
60
+ </li>
61
+ ))}
62
+ </ul>
63
+ </motion.div>
64
+ </div>
65
+ </div>
66
+ </section>
67
+ );
68
+ };
69
+
70
+ export default ProblemSolution;
FRRONTEEEND/components/Process.tsx ADDED
@@ -0,0 +1,70 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+
5
+ const steps = [
6
+ {
7
+ number: "01",
8
+ title: "Ingest Data",
9
+ description: "Upload your raw CSV, JSON, or Parquet files directly to the secure environment."
10
+ },
11
+ {
12
+ number: "02",
13
+ title: "Define Objective",
14
+ description: "Describe what you want to achieve in natural language. 'Predict churn' or 'Find outliers'."
15
+ },
16
+ {
17
+ number: "03",
18
+ title: "Agent Execution",
19
+ description: "The agent orchestrates tools to clean, transform, and model your data autonomously."
20
+ },
21
+ {
22
+ number: "04",
23
+ title: "Receive Assets",
24
+ description: "Get fully trained models, performance metrics, and interactive explainable reports."
25
+ }
26
+ ];
27
+
28
+ const Process = () => {
29
+ return (
30
+ <section id="process" className="py-24 bg-[#030303] border-y border-white/5">
31
+ <div className="max-w-7xl mx-auto px-6">
32
+ <div className="text-center mb-20">
33
+ <h2 className="text-4xl md:text-5xl font-extrabold text-white mb-4 tracking-tight">How it Works</h2>
34
+ <p className="text-white/40 text-xl font-medium">From raw data to actionable intelligence in 4 steps.</p>
35
+ </div>
36
+
37
+ <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-12">
38
+ {steps.map((step, i) => (
39
+ <motion.div
40
+ key={i}
41
+ initial={{ opacity: 0, scale: 0.95 }}
42
+ whileInView={{ opacity: 1, scale: 1 }}
43
+ viewport={{ once: true }}
44
+ transition={{ delay: i * 0.1 }}
45
+ className="relative"
46
+ >
47
+ <span className="text-7xl font-extrabold text-white/5 absolute -top-10 -left-4 select-none italic">
48
+ {step.number}
49
+ </span>
50
+ <div className="relative z-10">
51
+ <h3 className="text-xl font-bold text-white mb-4 flex items-center gap-2 tracking-tight">
52
+ <span className="w-1.5 h-1.5 rounded-full bg-indigo-500" />
53
+ {step.title}
54
+ </h3>
55
+ <p className="text-white/40 leading-relaxed font-medium">
56
+ {step.description}
57
+ </p>
58
+ </div>
59
+ {i < steps.length - 1 && (
60
+ <div className="hidden lg:block absolute top-1/2 -right-6 w-12 h-[1px] bg-gradient-to-r from-white/10 to-transparent" />
61
+ )}
62
+ </motion.div>
63
+ ))}
64
+ </div>
65
+ </div>
66
+ </section>
67
+ );
68
+ };
69
+
70
+ export default Process;
FRRONTEEEND/components/ShadowSection.tsx ADDED
@@ -0,0 +1,222 @@
1
+
2
+ 'use client';
3
+
4
+ import React, { useRef, useId, useEffect, CSSProperties } from 'react';
5
+ import { animate, useMotionValue, AnimationPlaybackControls, motion } from 'framer-motion';
6
+ import { cn } from '../lib/utils';
7
+
8
+ // Type definitions
9
+ interface ResponsiveImage {
10
+ src: string;
11
+ alt?: string;
12
+ srcSet?: string;
13
+ }
14
+
15
+ interface AnimationConfig {
16
+ preview?: boolean;
17
+ scale: number;
18
+ speed: number;
19
+ }
20
+
21
+ interface NoiseConfig {
22
+ opacity: number;
23
+ scale: number;
24
+ }
25
+
26
+ interface ShadowOverlayProps {
27
+ type?: 'preset' | 'custom';
28
+ presetIndex?: number;
29
+ customImage?: ResponsiveImage;
30
+ sizing?: 'fill' | 'stretch';
31
+ color?: string;
32
+ animation?: AnimationConfig;
33
+ noise?: NoiseConfig;
34
+ style?: CSSProperties;
35
+ className?: string;
36
+ title?: string;
37
+ description?: string;
38
+ }
39
+
40
+ function mapRange(
41
+ value: number,
42
+ fromLow: number,
43
+ fromHigh: number,
44
+ toLow: number,
45
+ toHigh: number
46
+ ): number {
47
+ if (fromLow === fromHigh) {
48
+ return toLow;
49
+ }
50
+ const percentage = (value - fromLow) / (fromHigh - fromLow);
51
+ return toLow + percentage * (toHigh - toLow);
52
+ }
53
+
54
+ const useInstanceId = (): string => {
55
+ const id = useId();
56
+ const cleanId = id.replace(/:/g, "");
57
+ const instanceId = `shadowoverlay-${cleanId}`;
58
+ return instanceId;
59
+ };
60
+
61
+ export function ShadowSection({
62
+ sizing = 'fill',
63
+ color = 'rgba(99, 102, 241, 0.6)',
64
+ animation = { scale: 50, speed: 15 },
65
+ noise = { opacity: 0.1, scale: 0.5 },
66
+ style,
67
+ className,
68
+ title = "Cognitive Core",
69
+ description = "The unseen intelligence powering your most critical decisions."
70
+ }: ShadowOverlayProps) {
71
+ const id = useInstanceId();
72
+ const animationEnabled = animation && animation.scale > 0;
73
+ const feColorMatrixRef = useRef<SVGFEColorMatrixElement>(null);
74
+ const hueRotateMotionValue = useMotionValue(180);
75
+ const hueRotateAnimation = useRef<AnimationPlaybackControls | null>(null);
76
+
77
+ const displacementScale = animation ? mapRange(animation.scale, 1, 100, 20, 100) : 0;
78
+ const animationDuration = animation ? mapRange(animation.speed, 1, 100, 1000, 50) : 1;
79
+
80
+ useEffect(() => {
81
+ if (feColorMatrixRef.current && animationEnabled) {
82
+ if (hueRotateAnimation.current) {
83
+ hueRotateAnimation.current.stop();
84
+ }
85
+ hueRotateMotionValue.set(0);
86
+ hueRotateAnimation.current = animate(hueRotateMotionValue, 360, {
87
+ duration: animationDuration / 25,
88
+ repeat: Infinity,
89
+ repeatType: "loop",
90
+ repeatDelay: 0,
91
+ ease: "linear",
92
+ delay: 0,
93
+ onUpdate: (value: number) => {
94
+ if (feColorMatrixRef.current) {
95
+ feColorMatrixRef.current.setAttribute("values", String(value));
96
+ }
97
+ }
98
+ });
99
+
100
+ return () => {
101
+ if (hueRotateAnimation.current) {
102
+ hueRotateAnimation.current.stop();
103
+ }
104
+ };
105
+ }
106
+ }, [animationEnabled, animationDuration, hueRotateMotionValue]);
107
+
108
+ return (
109
+ <section
110
+ className={cn("relative w-full h-[70vh] min-h-[500px] overflow-hidden bg-[#030303]", className)}
111
+ style={style}
112
+ >
113
+ <div
114
+ style={{
115
+ position: "absolute",
116
+ inset: -displacementScale,
117
+ filter: animationEnabled ? `url(#${id}) blur(8px)` : "none"
118
+ }}
119
+ >
120
+ {animationEnabled && (
121
+ <svg style={{ position: "absolute", width: 0, height: 0 }}>
122
+ <defs>
123
+ <filter id={id}>
124
+ <feTurbulence
125
+ result="undulation"
126
+ numOctaves="2"
127
+ baseFrequency={`${mapRange(animation.scale, 0, 100, 0.001, 0.0005)},${mapRange(animation.scale, 0, 100, 0.004, 0.002)}`}
128
+ seed="0"
129
+ type="turbulence"
130
+ />
131
+ <feColorMatrix
132
+ ref={feColorMatrixRef}
133
+ in="undulation"
134
+ type="hueRotate"
135
+ values="180"
136
+ />
137
+ <feColorMatrix
138
+ in="dist"
139
+ result="circulation"
140
+ type="matrix"
141
+ values="4 0 0 0 1 4 0 0 0 1 4 0 0 0 1 1 0 0 0 0"
142
+ />
143
+ <feDisplacementMap
144
+ in="SourceGraphic"
145
+ in2="circulation"
146
+ scale={displacementScale}
147
+ result="dist"
148
+ />
149
+ <feDisplacementMap
150
+ in="dist"
151
+ in2="undulation"
152
+ scale={displacementScale}
153
+ result="output"
154
+ />
155
+ </filter>
156
+ </defs>
157
+ </svg>
158
+ )}
159
+ <div
160
+ style={{
161
+ backgroundColor: color,
162
+ maskImage: `url('https://framerusercontent.com/images/ceBGguIpUU8luwByxuQz79t7To.png')`,
163
+ maskSize: sizing === "stretch" ? "100% 100%" : "cover",
164
+ maskRepeat: "no-repeat",
165
+ maskPosition: "center",
166
+ width: "100%",
167
+ height: "100%"
168
+ }}
169
+ />
170
+ </div>
171
+
172
+ <div
173
+ style={{
174
+ position: "absolute",
175
+ top: "50%",
176
+ left: "50%",
177
+ transform: "translate(-50%, -50%)",
178
+ textAlign: "center",
179
+ zIndex: 20,
180
+ width: '100%',
181
+ padding: '0 2rem'
182
+ }}
183
+ >
184
+ <motion.h2
185
+ initial={{ opacity: 0, y: 20 }}
186
+ whileInView={{ opacity: 1, y: 0 }}
187
+ viewport={{ once: true }}
188
+ className="md:text-7xl text-5xl lg:text-8xl font-heading font-bold text-center text-white relative z-20 tracking-tighter mb-4"
189
+ >
190
+ {title}
191
+ </motion.h2>
192
+ <motion.p
193
+ initial={{ opacity: 0, y: 20 }}
194
+ whileInView={{ opacity: 1, y: 0 }}
195
+ viewport={{ once: true }}
196
+ transition={{ delay: 0.2 }}
197
+ className="text-white/60 text-lg md:text-xl font-sans max-w-xl mx-auto"
198
+ >
199
+ {description}
200
+ </motion.p>
201
+ </div>
202
+
203
+ {noise && noise.opacity > 0 && (
204
+ <div
205
+ style={{
206
+ position: "absolute",
207
+ inset: 0,
208
+ backgroundImage: `url("https://framerusercontent.com/images/g0QcWrxr87K0ufOxIUFBakwYA8.png")`,
209
+ backgroundSize: noise.scale * 200,
210
+ backgroundRepeat: "repeat",
211
+ opacity: noise.opacity / 2,
212
+ zIndex: 15
213
+ }}
214
+ />
215
+ )}
216
+
217
+ {/* Bottom Vignette */}
218
+ <div className="absolute inset-x-0 bottom-0 h-40 bg-gradient-to-t from-[#030303] to-transparent z-30" />
219
+ <div className="absolute inset-x-0 top-0 h-40 bg-gradient-to-b from-[#030303] to-transparent z-30" />
220
+ </section>
221
+ );
222
+ }
FRRONTEEEND/components/TechStack.tsx ADDED
@@ -0,0 +1,36 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+
5
+ const techs = [
6
+ "Python", "Polars", "Pandas", "Scikit-Learn", "XGBoost", "LightGBM", "Groq", "Gemini", "FastAPI", "Cloud Run", "Docker", "PyTorch"
7
+ ];
8
+
9
+ const TechStack = () => {
10
+ return (
11
+ <section className="py-24 bg-[#030303]">
12
+ <div className="max-w-7xl mx-auto px-6">
13
+ <div className="text-center mb-12">
14
+ <h3 className="text-xs font-bold uppercase tracking-[0.3em] text-white/30 italic">Built with the modern AI Stack</h3>
15
+ </div>
16
+
17
+ <div className="flex flex-wrap justify-center gap-4 md:gap-6 opacity-60">
18
+ {techs.map((tech, i) => (
19
+ <motion.div
20
+ key={tech}
21
+ initial={{ opacity: 0 }}
22
+ whileInView={{ opacity: 1 }}
23
+ viewport={{ once: true }}
24
+ transition={{ delay: i * 0.05 }}
25
+ className="px-5 py-2 rounded-lg border border-white/5 bg-white/[0.02] text-white/80 font-bold text-xs md:text-sm whitespace-nowrap tracking-wide uppercase"
26
+ >
27
+ {tech}
28
+ </motion.div>
29
+ ))}
30
+ </div>
31
+ </div>
32
+ </section>
33
+ );
34
+ };
35
+
36
+ export default TechStack;
FRRONTEEEND/index.html ADDED
@@ -0,0 +1,59 @@
1
+
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="UTF-8" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>Data Science Agent</title>
8
+ <script src="https://cdn.tailwindcss.com"></script>
9
+ <link rel="preconnect" href="https://fonts.googleapis.com">
10
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
11
+ <link href="https://fonts.googleapis.com/css2?family=Plus+Jakarta+Sans:ital,wght@0,200;0,300;0,400;0,500;0,600;0,700;0,800;1,200;1,300;1,400;1,500;1,600;1,700;1,800&display=swap" rel="stylesheet">
12
+ <script>
13
+ tailwind.config = {
14
+ theme: {
15
+ extend: {
16
+ fontFamily: {
17
+ sans: ['Plus Jakarta Sans', 'sans-serif'],
18
+ heading: ['Plus Jakarta Sans', 'sans-serif'],
19
+ mono: ['Plus Jakarta Sans', 'sans-serif'],
20
+ },
21
+ },
22
+ },
23
+ }
24
+ </script>
25
+ <style>
26
+ body {
27
+ margin: 0;
28
+ background-color: #030303;
29
+ overflow-x: hidden;
30
+ font-family: 'Plus Jakarta Sans', sans-serif;
31
+ -webkit-font-smoothing: antialiased;
32
+ -moz-osx-font-smoothing: grayscale;
33
+ }
34
+ ::selection {
35
+ background-color: rgba(99, 102, 241, 0.3);
36
+ color: white;
37
+ }
38
+ </style>
39
+ <script type="importmap">
40
+ {
41
+ "imports": {
42
+ "react": "https://esm.sh/react@^19.2.3",
43
+ "react-dom/": "https://esm.sh/react-dom@^19.2.3/",
44
+ "react/": "https://esm.sh/react@^19.2.3/",
45
+ "clsx": "https://esm.sh/clsx@^2.1.1",
46
+ "tailwind-merge": "https://esm.sh/tailwind-merge@^3.4.0",
47
+ "framer-motion": "https://esm.sh/framer-motion@^12.23.26",
48
+ "lucide-react": "https://esm.sh/lucide-react@^0.562.0",
49
+ "@google/genai": "https://esm.sh/@google/genai@^1.34.0"
50
+ }
51
+ }
52
+ </script>
53
+ <link rel="stylesheet" href="/index.css">
54
+ </head>
55
+ <body>
56
+ <div id="root"></div>
57
+ <script type="module" src="/index.tsx"></script>
58
+ </body>
59
+ </html>
FRRONTEEEND/index.tsx ADDED
@@ -0,0 +1,16 @@
1
+
2
+ import React from 'react';
3
+ import ReactDOM from 'react-dom/client';
4
+ import App from './App';
5
+
6
+ const rootElement = document.getElementById('root');
7
+ if (!rootElement) {
8
+ throw new Error("Could not find root element to mount to");
9
+ }
10
+
11
+ const root = ReactDOM.createRoot(rootElement);
12
+ root.render(
13
+ <React.StrictMode>
14
+ <App />
15
+ </React.StrictMode>
16
+ );
FRRONTEEEND/lib/utils.ts ADDED
@@ -0,0 +1,7 @@
1
+
2
+ import { clsx, type ClassValue } from 'clsx';
3
+ import { twMerge } from 'tailwind-merge';
4
+
5
+ export function cn(...inputs: ClassValue[]) {
6
+ return twMerge(clsx(inputs));
7
+ }
FRRONTEEEND/metadata.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "name": "Data Science Agent",
3
+ "description": "A production-grade autonomous AI agent for end-to-end data science workflows, featuring 82+ specialized tools and dual LLM support.",
4
+ "requestFramePermissions": []
5
+ }
FRRONTEEEND/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
FRRONTEEEND/package.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "name": "data-science-agent",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "preview": "vite preview"
10
+ },
11
+ "dependencies": {
12
+ "react": "^19.2.3",
13
+ "react-dom": "^19.2.3",
14
+ "clsx": "^2.1.1",
15
+ "tailwind-merge": "^3.4.0",
16
+ "framer-motion": "^12.23.26",
17
+ "lucide-react": "^0.562.0",
18
+ "react-markdown": "^9.0.1"
19
+ },
20
+ "devDependencies": {
21
+ "@types/node": "^22.14.0",
22
+ "@vitejs/plugin-react": "^5.0.0",
23
+ "typescript": "~5.8.2",
24
+ "vite": "^6.2.0"
25
+ }
26
+ }
FRRONTEEEND/tsconfig.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "experimentalDecorators": true,
5
+ "useDefineForClassFields": false,
6
+ "module": "ESNext",
7
+ "lib": [
8
+ "ES2022",
9
+ "DOM",
10
+ "DOM.Iterable"
11
+ ],
12
+ "skipLibCheck": true,
13
+ "types": [
14
+ "node"
15
+ ],
16
+ "moduleResolution": "bundler",
17
+ "isolatedModules": true,
18
+ "moduleDetection": "force",
19
+ "allowJs": true,
20
+ "jsx": "react-jsx",
21
+ "paths": {
22
+ "@/*": [
23
+ "./*"
24
+ ]
25
+ },
26
+ "allowImportingTsExtensions": true,
27
+ "noEmit": true
28
+ }
29
+ }
FRRONTEEEND/vite.config.ts ADDED
@@ -0,0 +1,29 @@
1
+ import path from 'path';
2
+ import { defineConfig, loadEnv } from 'vite';
3
+ import react from '@vitejs/plugin-react';
4
+
5
+ export default defineConfig(({ mode }) => {
6
+ const env = loadEnv(mode, '.', '');
7
+ return {
8
+ server: {
9
+ port: 3000,
10
+ host: '0.0.0.0',
11
+ proxy: {
12
+ '/api': {
13
+ target: env.VITE_API_URL || 'http://localhost:8080',
14
+ changeOrigin: true,
15
+ rewrite: (path) => path.replace(/^\/api/, '')
16
+ }
17
+ }
18
+ },
19
+ plugins: [react()],
20
+ define: {
21
+ 'import.meta.env.VITE_API_URL': JSON.stringify(env.VITE_API_URL || 'http://localhost:8080')
22
+ },
23
+ resolve: {
24
+ alias: {
25
+ '@': path.resolve(__dirname, '.'),
26
+ }
27
+ }
28
+ };
29
+ });
GEMINI_UPDATE.md ADDED
@@ -0,0 +1,93 @@
1
+ # 🔄 Updated to Use Google Gemini!
2
+
3
+ ## What Changed
4
+
5
+ The application now uses **Google Gemini (gemini-2.0-flash-exp)** instead of Groq for the chat interface.
6
+
7
+ ## Required Setup
8
+
9
+ ### 1. Set Your Google API Key
10
+
11
+ ```powershell
12
+ # Windows PowerShell
13
+ $env:GOOGLE_API_KEY="your-google-api-key-here"
14
+
15
+ # Verify it's set
16
+ echo $env:GOOGLE_API_KEY
17
+ ```
18
+
19
+ ### 2. Get Your API Key
20
+
21
+ If you don't have a Google API key:
22
+ 1. Go to [Google AI Studio](https://aistudio.google.com/app/apikey)
23
+ 2. Create a new API key
24
+ 3. Copy and set it as shown above
25
+
26
+ ## Quick Start
27
+
28
+ ```powershell
29
+ # Set your API key
30
+ $env:GOOGLE_API_KEY="your-key-here"
31
+
32
+ # Run the application
33
+ .\start.ps1
34
+ ```
35
+
36
+ Then open: **http://localhost:8080**
37
+
38
+ ## What's Using Gemini
39
+
40
+ - ✅ **Chat Interface** (`/chat` endpoint) - Uses Gemini 2.0 Flash
41
+ - ℹ️ **Full Workflow** (`/run` endpoint) - Uses the main agent (configurable via LLM_PROVIDER)
42
+
43
+ ## Technical Details
44
+
45
+ The `/chat` endpoint now:
46
+ - Uses `google.generativeai` SDK
47
+ - Model: `gemini-2.0-flash-exp`
48
+ - Maintains conversation history
49
+ - Professional data science system instruction
50
+
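+ As an illustration, a minimal handler along these lines could be written with the `google.generativeai` SDK (a sketch only — function and variable names here are assumptions, not the exact code in src/api/app.py):
+
+ ```python
+ import os
+ import google.generativeai as genai
+
+ genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+
+ # System instruction keeps the model acting as a data science assistant.
+ model = genai.GenerativeModel(
+     "gemini-2.0-flash-exp",
+     system_instruction="You are a professional data science assistant.",
+ )
+
+ def chat_reply(messages: list[dict]) -> str:
+     """messages: [{"role": "user"|"assistant", "content": "..."}] as sent to /chat."""
+     # Gemini's history format uses "model" for assistant turns and "parts" for text.
+     history = [
+         {"role": "model" if m["role"] == "assistant" else "user", "parts": [m["content"]]}
+         for m in messages[:-1]
+     ]
+     chat = model.start_chat(history=history)
+     return chat.send_message(messages[-1]["content"]).text
+ ```
+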
51
+ ## Expected Console Output
52
+
53
+ When you start the server:
54
+ ```
55
+ INFO: Started server process [####]
56
+ INFO: Waiting for application startup.
57
+ ✅ Agent initialized with provider: gemini
58
+ ✅ Frontend assets mounted from C:\Users\Pulastya\Videos\DS AGENTTTT\FRRONTEEEND\dist
59
+ INFO: Application startup complete.
60
+ INFO: Uvicorn running on http://0.0.0.0:8080
61
+ ```
62
+
63
+ ## Files Updated
64
+
65
+ - ✅ [src/api/app.py](src/api/app.py) - `/chat` endpoint now uses Gemini
66
+ - ✅ [.env.example](.env.example) - Updated to GOOGLE_API_KEY
67
+ - ✅ [start.ps1](start.ps1) - Updated environment variable reference
68
+ - ✅ [start.sh](start.sh) - Updated environment variable reference
69
+ - ✅ [CHECKLIST.md](CHECKLIST.md) - Updated instructions
70
+ - ✅ [FRRONTEEEND/.env](FRRONTEEEND/.env) - Added note about Gemini
71
+
72
+ ## Troubleshooting
73
+
74
+ ### Error: "API key not configured"
75
+ **Solution**: Make sure you've set the environment variable:
76
+ ```powershell
77
+ $env:GOOGLE_API_KEY="your-actual-api-key"
78
+ ```
79
+
80
+ ### Error: "Module google.generativeai not found"
81
+ **Solution**: The dependency is already in requirements.txt. Verify it's installed:
82
+ ```bash
83
+ pip install google-generativeai
84
+ ```
85
+
86
+ ### Rate Limits
87
+ Gemini 2.0 Flash has generous rate limits:
88
+ - Free tier: 15 RPM (requests per minute)
89
+ - 1 million TPM (tokens per minute)
90
+
91
+ ---
92
+
93
+ **Ready?** Set your `GOOGLE_API_KEY` and run `.\start.ps1` 🚀
MIGRATION_COMPLETE.md ADDED
@@ -0,0 +1,325 @@
1
+ # 🎉 Frontend Migration Complete!
2
+
3
+ ## Summary
4
+
5
+ Successfully replaced the old Gradio interface with a modern React-based frontend featuring:
6
+ - **Professional Landing Page**: Showcases the agent's capabilities
7
+ - **Modern Chat Interface**: NextChat-style conversational UI
8
+ - **Direct Backend Integration**: Communicates with FastAPI backend
9
+ - **Beautiful Design**: Dark theme with animations and responsive layout
10
+
11
+ ## What Was Changed
12
+
13
+ ### ✅ Backend Updates ([src/api/app.py](src/api/app.py))
14
+ 1. **Added CORS middleware** for frontend communication
15
+ 2. **Created `/chat` endpoint** for conversational interface
16
+ 3. **Static file serving** for built React app
17
+ 4. **Catch-all route** to serve `index.html` for client-side routing
18
+
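+ Taken together, the four updates above boil down to roughly the following FastAPI wiring (a simplified sketch, not the full src/api/app.py — the chat body and exact paths are abbreviated):
+
+ ```python
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse
+ from fastapi.staticfiles import StaticFiles
+
+ app = FastAPI()
+
+ # 1. CORS so the Vite dev server (port 3000) can call the API (port 8080)
+ app.add_middleware(CORSMiddleware, allow_origins=["*"],
+                    allow_methods=["*"], allow_headers=["*"])
+
+ # 2. Conversational endpoint consumed by ChatInterface.tsx
+ @app.post("/chat")
+ async def chat(payload: dict):
+     ...  # forward payload["messages"] to the LLM, return the reply
+
+ # 3. Static assets produced by `npm run build`
+ app.mount("/assets", StaticFiles(directory="FRRONTEEEND/dist/assets"), name="assets")
+
+ # 4. Catch-all: any non-API GET falls through to the React app
+ @app.get("/{full_path:path}")
+ async def spa(full_path: str):
+     return FileResponse("FRRONTEEEND/dist/index.html")
+ ```
+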
19
+ ### ✅ Frontend Updates
20
+ 1. **Removed Google GenAI dependency** from [package.json](FRRONTEEEND/package.json)
21
+ 2. **Updated ChatInterface.tsx** to call backend `/chat` endpoint instead of external API
22
+ 3. **Added environment configuration**:
23
+ - `.env` for local development
24
+ - `.env.production` for production builds
25
+ 4. **Updated vite.config.ts** with proxy configuration
26
+
27
+ ### ✅ Configuration Files
28
+ 1. **requirements.txt**: Commented out Gradio (no longer needed)
29
+ 2. **Dockerfile**: Added multi-stage build for React frontend
30
+ 3. **.dockerignore**: Excluded node_modules and frontend dev files
31
+ 4. **New Scripts**:
32
+ - `start.ps1` / `start.sh` - Quick start scripts
33
+ - `build-and-deploy.ps1` / `build-and-deploy.sh` - Build scripts
34
+
35
+ ### ✅ Documentation
36
+ - **FRONTEND_INTEGRATION.md**: Complete integration guide
37
+ - **README.md**: Updated with frontend announcement
38
+
39
+ ## 🚀 How to Run
40
+
41
+ ### Quick Start (Recommended)
42
+
43
+ **Windows:**
44
+ ```powershell
45
+ .\start.ps1
46
+ ```
47
+
48
+ **Linux/Mac:**
49
+ ```bash
50
+ chmod +x start.sh
51
+ ./start.sh
52
+ ```
53
+
54
+ ### Manual Steps
55
+
56
+ 1. **Build Frontend** (already done ✅):
57
+ ```bash
58
+ cd FRRONTEEEND
59
+ npm.cmd install
60
+ npm.cmd run build
61
+ cd ..
62
+ ```
63
+
64
+ 2. **Set Environment Variables**:
65
+ ```powershell
66
+ # Required
67
+ $env:GROQ_API_KEY="your-groq-api-key-here"
68
+
69
+ # Optional
70
+ $env:GOOGLE_API_KEY="your-google-api-key"
71
+ ```
72
+
73
+ 3. **Start Backend**:
74
+ ```bash
75
+ python src\api\app.py
76
+ ```
77
+
78
+ 4. **Access Application**:
79
+ Open browser to: **http://localhost:8080**
80
+
81
+ ## 🏗️ Architecture
82
+
83
+ ```
84
+ ┌─────────────────────────────────────────────────────────┐
85
+ │ Browser │
86
+ │ │
87
+ │ ┌──────────────────────────────────────────────────┐ │
88
+ │ │ React Frontend (Port 8080) │ │
89
+ │ │ - Landing Page (HeroGeometric, etc.) │ │
90
+ │ │ - Chat Interface (ChatInterface.tsx) │ │
91
+ │ └──────────────────────────────────────────────────┘ │
92
+ │ │ │
93
+ │ │ HTTP POST /chat │
94
+ └─────────────────────────┼────────────────────────────────┘
95
+
96
+
97
+ ┌─────────────────────────────────────────────────────────┐
98
+ │ FastAPI Backend (Port 8080) │
99
+ │ │
100
+ │ ┌──────────────────────────────────────────────────┐ │
101
+ │ │ API Endpoints │ │
102
+ │ │ - POST /chat → Chat with agent │ │
103
+ │ │ - POST /run → Full workflow │ │
104
+ │ │ - POST /profile → Dataset profiling │ │
105
+ │ │ - GET /tools → List tools │ │
106
+ │ │ - GET /* → Serve React app │ │
107
+ │ └──────────────────────────────────────────────────┘ │
108
+ │ │ │
109
+ │ ▼ │
110
+ │ ┌──────────────────────────────────────────────────┐ │
111
+ │ │ DataScienceCopilot (orchestrator.py) │ │
112
+ │ │ - 82+ Tools │ │
113
+ │ │ - Groq LLM │ │
114
+ │ │ - Session Memory │ │
115
+ │ └──────────────────────────────────────────────────┘ │
116
+ └─────────────────────────────────────────────────────────┘
117
+ ```
118
+
119
+ ## 🎯 Key Endpoints
120
+
121
+ ### `/chat` - Conversational Interface
122
+ ```typescript
123
+ POST /chat
124
+ Content-Type: application/json
125
+
126
+ {
127
+ "messages": [
128
+ {"role": "user", "content": "Profile my dataset"},
129
+ {"role": "assistant", "content": "..."}
130
+ ],
131
+ "stream": false
132
+ }
133
+ ```
134
+
135
+ **Response:**
136
+ ```json
137
+ {
138
+ "success": true,
139
+ "message": "I can help you profile your dataset...",
140
+ "model": "llama-3.3-70b-versatile",
141
+ "provider": "groq"
142
+ }
143
+ ```
144
+
145
+ ### `/run` - Complete Workflow
146
+ ```bash
147
+ POST /run
148
+ Content-Type: multipart/form-data
149
+
150
+ file: <dataset.csv>
151
+ task_description: "Predict house prices"
152
+ target_col: "price"
153
+ ```
154
+
155
+ ### `/profile` - Quick Profiling
156
+ ```bash
157
+ POST /profile
158
+ Content-Type: multipart/form-data
159
+
160
+ file: <dataset.csv>
161
+ ```
162
+
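+ The same three endpoints can also be exercised from Python — a hedged client sketch (field names mirror the examples above; nothing beyond them is assumed):
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8080"
+
+ # /chat takes a JSON body with the running message history
+ r = requests.post(f"{BASE}/chat", json={
+     "messages": [{"role": "user", "content": "Profile my dataset"}],
+     "stream": False,
+ })
+ print(r.json()["message"])
+
+ # /run (and /profile) take multipart form uploads
+ with open("data.csv", "rb") as f:
+     r = requests.post(f"{BASE}/run", files={"file": f}, data={
+         "task_description": "Predict house prices",
+         "target_col": "price",
+     })
+ print(r.json())
+ ```
+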
163
+ ## 📝 Environment Variables
164
+
165
+ ### Backend (.env or system)
166
+ ```env
167
+ # Required
168
+ GROQ_API_KEY=your-groq-api-key
169
+
170
+ # Optional
171
+ GOOGLE_API_KEY=your-google-api-key
172
+ GCP_PROJECT_ID=your-project-id
173
+ LLM_PROVIDER=groq # or "gemini"
174
+ ```
175
+
176
+ ### Frontend (FRRONTEEEND/.env)
177
+ ```env
178
+ # Development
179
+ VITE_API_URL=http://localhost:8080
180
+
181
+ # Production (FRRONTEEEND/.env.production)
182
+ VITE_API_URL=https://your-cloud-run-url.run.app
183
+ ```
184
+
185
+ ## 🐳 Docker Deployment
186
+
187
+ The Dockerfile now includes a multi-stage build:
188
+
189
+ ```bash
190
+ # Build image
191
+ docker build -t data-science-agent .
192
+
193
+ # Run container
194
+ docker run -p 8080:8080 \
195
+ -e GROQ_API_KEY=your-key \
196
+ data-science-agent
197
+ ```
198
+
199
+ ## ☁️ Google Cloud Run Deployment
200
+
201
+ ```bash
202
+ # Build and push
203
+ gcloud builds submit --tag gcr.io/YOUR-PROJECT-ID/data-science-agent
204
+
205
+ # Deploy
206
+ gcloud run deploy data-science-agent \
207
+ --image gcr.io/YOUR-PROJECT-ID/data-science-agent \
208
+ --platform managed \
209
+ --region us-central1 \
210
+ --allow-unauthenticated \
211
+ --set-env-vars GROQ_API_KEY=your-api-key
212
+ ```
213
+
214
+ ## 🔍 Testing
215
+
216
+ ### Test Backend API
217
+ ```bash
218
+ # Health check
219
+ curl http://localhost:8080/health
220
+
221
+ # List tools
222
+ curl http://localhost:8080/tools
223
+
224
+ # Chat
225
+ curl -X POST http://localhost:8080/chat \
226
+ -H "Content-Type: application/json" \
227
+ -d '{
228
+ "messages": [
229
+ {"role": "user", "content": "Hello, what can you do?"}
230
+ ]
231
+ }'
232
+ ```
233
+
234
+ ### Test Frontend
235
+ 1. Open browser: http://localhost:8080
236
+ 2. Click "Launch Console"
237
+ 3. Type a message and send
238
+
239
+ ## 🎨 Frontend Development
240
+
241
+ For frontend development with hot-reloading:
242
+
243
+ **Terminal 1 - Backend:**
244
+ ```bash
245
+ python src\api\app.py
246
+ ```
247
+
248
+ **Terminal 2 - Frontend:**
249
+ ```bash
250
+ cd FRRONTEEEND
251
+ npm.cmd run dev
252
+ ```
253
+
254
+ Access:
255
+ - Frontend Dev: http://localhost:3000
256
+ - Backend API: http://localhost:8080
257
+
258
+ ## 📦 Build Status
259
+
260
+ ✅ **Frontend Built**: FRRONTEEEND/dist/ contains:
261
+ - index.html
262
+ - assets/index-[hash].js (384 KB)
263
+
264
+ ✅ **Backend Ready**: src/api/app.py configured to:
265
+ - Serve static files from FRRONTEEEND/dist/assets
266
+ - Route all non-API requests to index.html
267
+ - Handle /chat endpoint
268
+
269
+ ## 🔄 Migration Notes
270
+
271
+ ### What's Deprecated
272
+ - ❌ `chat_ui.py` - Old Gradio interface (kept for reference)
273
+ - ❌ Direct Google GenAI calls from frontend
274
+
275
+ ### What's New
276
+ - ✅ React 19 + TypeScript
277
+ - ✅ Vite 6 build system
278
+ - ✅ Tailwind CSS styling
279
+ - ✅ Framer Motion animations
280
+ - ✅ Backend-first architecture
281
+
282
+ ## 🐛 Troubleshooting
283
+
284
+ ### Issue: Frontend shows 404
285
+ **Solution**: Make sure you've built the frontend:
286
+ ```bash
287
+ cd FRRONTEEEND
288
+ npm.cmd run build
289
+ ```
290
+
291
+ ### Issue: API errors in chat
292
+ **Solution**:
293
+ 1. Check backend is running: `python src\api\app.py`
294
+ 2. Verify GROQ_API_KEY is set
295
+ 3. Check console for errors
296
+
297
+ ### Issue: CORS errors
298
+ **Solution**: The backend has CORS enabled. If issues persist, check the `allow_origins` in app.py
299
+
300
+ ### Issue: Module import errors
301
+ **Solution**: Make sure all Python dependencies are installed:
302
+ ```bash
303
+ pip install -r requirements.txt
304
+ ```
305
+
306
+ ## 📚 Additional Resources
307
+
308
+ - **[FRONTEND_INTEGRATION.md](FRONTEND_INTEGRATION.md)** - Detailed integration guide
309
+ - **[README.md](README.md)** - Main project documentation
310
+ - **[DEPLOYMENT.md](DEPLOYMENT.md)** - Cloud deployment guide
311
+
312
+ ## ✨ Next Steps
313
+
314
+ 1. **File Upload**: Add file upload capability to ChatInterface
315
+ 2. **Visualizations**: Display charts and plots in chat
316
+ 3. **Session Persistence**: Store chat history in backend
317
+ 4. **Authentication**: Add user authentication
318
+ 5. **Streaming**: Implement streaming responses
319
+ 6. **Dark/Light Mode**: Add theme toggle
320
+
321
+ ---
322
+
323
+ **Status**: ✅ Ready to use!
324
+
325
+ **Last Updated**: December 27, 2025
QUICK_REFERENCE.txt ADDED
@@ -0,0 +1,71 @@
1
+ ╔═══════════════════════════════════════════════════════════════╗
2
+ ║ 🚀 DATA SCIENCE AGENT - QUICK REFERENCE ║
3
+ ║ Now powered by Google Gemini! 🤖 ║
4
+ ╚═══════════════════════════════════════════════════════════════╝
5
+
6
+ ┌───────────────────────────────────────────────────────────────┐
7
+ │ 1. SET API KEY (REQUIRED!) │
8
+ └───────────────────────────────────────────────────────────────┘
9
+
10
+ PowerShell:
11
+ $env:GOOGLE_API_KEY="your-google-api-key-here"
12
+
13
+ Get your key: https://aistudio.google.com/app/apikey
14
+
15
+ ┌───────────────────────────────────────────────────────────────┐
16
+ │ 2. START THE APPLICATION │
17
+ └───────────────────────────────────────────────────────────────┘
18
+
19
+ .\start.ps1
20
+
21
+ ┌───────────────────────────────────────────────────────────────┐
22
+ │ 3. ACCESS THE APP │
23
+ └───────────────────────────────────────────────────────────────┘
24
+
25
+ Open browser: http://localhost:8080
26
+
27
+ ┌───────────────────────────────────────────────────────────────┐
28
+ │ WHAT'S INCLUDED │
29
+ └───────────────────────────────────────────────────────────────┘
30
+
31
+ ✅ Modern React frontend with landing page
32
+ ✅ Professional chat interface
33
+ ✅ Google Gemini 2.0 Flash integration
34
+ ✅ 82+ data science tools
35
+ ✅ Complete ML pipeline automation
36
+
37
+ ┌───────────────────────────────────────────────────────────────┐
38
+ │ KEY FILES │
39
+ └───────────────────────────────────────────────────────────────┘
40
+
41
+ 📖 GEMINI_UPDATE.md - Gemini migration details
42
+ 📖 CHECKLIST.md - Pre-launch checklist
43
+ 📖 MIGRATION_COMPLETE.md - Full change log
44
+ 📖 FRONTEND_INTEGRATION.md - Technical docs
45
+
46
+ ┌───────────────────────────────────────────────────────────────┐
47
+ │ TROUBLESHOOTING │
48
+ └───────────────────────────────────────────────────────────────┘
49
+
50
+ Issue: "API key not configured"
51
+ → Set: $env:GOOGLE_API_KEY="your-key"
52
+
53
+ Issue: "Frontend not found"
54
+ → Run: cd FRRONTEEEND && npm run build
55
+
56
+ Issue: "Module not found"
57
+ → Run: pip install -r requirements.txt
58
+
59
+ ┌───────────────────────────────────────────────────────────────┐
60
+ │ API ENDPOINTS │
61
+ └───────────────────────────────────────────────────────────────┘
62
+
63
+ POST /chat - Chat with Gemini agent
64
+ POST /run - Full ML workflow
65
+ POST /profile - Quick dataset profiling
66
+ GET /tools - List available tools
67
+ GET /docs - API documentation
68
+
69
+ ╔═══════════════════════════════════════════════════════════════╗
70
+ ║ Ready to start? Run: .\start.ps1 ║
71
+ ���═══════════════════════════════════════════════════════════════╝
README.md ADDED
@@ -0,0 +1,632 @@
1
+ # Data Science Agent 🤖
2
+
3
+ A production-grade **autonomous AI agent** for end-to-end data science workflows. Upload datasets, describe your goal in natural language, and let the AI handle profiling, cleaning, feature engineering, model training, and visualization.
4
+
5
+ **Key Differentiator**: Not just a chatbot - a true AI agent with 82+ specialized tools, intelligent orchestration, dual LLM support, session memory, code interpreter, and Cloud Run API.
6
+
7
+ ---
8
+
9
+ > ## 🎉 **NEW: Modern React Frontend!**
10
+ >
11
+ > The application now features a **professional React-based web interface** with a beautiful landing page and chat UI, replacing the old Gradio interface.
12
+ >
13
+ > **Quick Start:**
14
+ > ```powershell
15
+ > .\start.ps1 # Windows
16
+ > ```
17
+ > or
18
+ > ```bash
19
+ > ./start.sh # Linux/Mac
20
+ > ```
21
+ >
22
+ > 📖 **[See Full Frontend Integration Guide →](FRONTEND_INTEGRATION.md)**
23
+
24
+ ---
25
+
26
+ ## 🎯 Project Vision
27
+
28
+ Build an **autonomous data science system** that achieves **50-70th percentile performance** on Kaggle competitions through intelligent automation, proving AI agents can handle real-world ML workflows end-to-end.
29
+
30
+ ---
31
+
32
+ ## ✨ Core Features
33
+
34
+ ### **🤖 Intelligent Agent System**
35
+ - **82+ Specialized Tools** across 14 categories (profiling, cleaning, feature engineering, ML, visualization, BigQuery)
36
+ - **Dual LLM Support**: Groq (llama-3.3-70b) + Google Gemini (2.0-flash-exp)
37
+ - **Smart Orchestration**: LLM-powered function calling with intelligent tool chaining
38
+ - **Session Memory**: Contextual awareness across conversations ("cross-validate it", "try with Ridge")
39
+ - **Code Interpreter**: Write and execute custom Python code for tasks beyond predefined tools
40
+ - **Error Recovery**: Automatic retry with corrected parameters
41
+ - **Reasoning Modules**: Dedicated LLM reasoning layer with 19 specialized functions
42
+ - **Cloud Integration**: BigQuery data access + GCS artifact storage
43
+
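+ At its core, the orchestration above is the standard LLM function-calling loop: send the conversation plus tool schemas, execute whatever tools the model requests, feed the results back, and repeat until the model answers in plain text. A stripped-down sketch of that loop (illustrative only — the real orchestrator adds session memory, retries, and schema compression):
+
+ ```python
+ import json
+ from groq import Groq
+
+ client = Groq()  # reads GROQ_API_KEY from the environment
+ TOOL_IMPLS = {"profile_dataset": lambda **kw: {"rows": 891}}  # hypothetical registry
+ TOOL_SCHEMAS = [...]  # JSON schemas describing the registered tools
+
+ def run(messages: list[dict]) -> str:
+     while True:
+         resp = client.chat.completions.create(
+             model="llama-3.3-70b-versatile",
+             messages=messages,
+             tools=TOOL_SCHEMAS,
+             tool_choice="auto",
+         )
+         msg = resp.choices[0].message
+         if not msg.tool_calls:       # plain-text answer: the loop is done
+             return msg.content
+         messages.append(msg)         # keep the tool request in context
+         for call in msg.tool_calls:  # execute each tool, feed results back
+             result = TOOL_IMPLS[call.function.name](**json.loads(call.function.arguments))
+             messages.append({"role": "tool", "tool_call_id": call.id,
+                              "content": json.dumps(result)})
+ ```
+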
44
+ ### 🎨 **Multiple Interfaces**
45
+ - **Gradio Web UI** (`chat_ui.py`): Upload files, chat interface, visual plots
46
+ - **CLI Interface** (`src/cli.py`): Command-line workflow automation
47
+ - **REST API** (`src/api/app.py`): Cloud Run-ready FastAPI wrapper
48
+ - **Python SDK**: Direct programmatic access
49
+
50
+ ### 📊 **Complete ML Pipeline**
51
+ 1. **Data Profiling** → Statistics, types, quality issues
52
+ 2. **Data Cleaning** → Smart imputation, outlier handling, type conversion
53
+ 3. **Feature Engineering** → Time features, encoding, interactions, ratios
54
+ 4. **Model Training** → XGBoost, LightGBM, CatBoost, ensemble methods
55
+ 5. **Hyperparameter Tuning** → Optuna-based optimization
56
+ 6. **Visualization** → Matplotlib, Plotly, interactive dashboards
57
+ 7. **EDA Reports** → Sweetviz, ydata-profiling HTML reports
58
+ 8. **Explainability** → SHAP values, feature importance
59
+
60
+ ### ⚡ **Performance & Scale**
61
+ - **Token Optimization**: 34% reduction in LLM context (compressed tool schemas)
62
+ - **SQLite Caching**: Memoization of expensive operations with TTL
63
+ - **Polars & DuckDB**: 10-100x faster than pandas for large datasets
64
+ - **Rate Limiting**: Intelligent API call management (Groq: 12K TPM, Gemini: 10 RPM)
65
+ - **Cloud Ready**: FastAPI service for Google Cloud Run deployment
66
+
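+ The cache works by keying results on a hash of the operation and its inputs, with an expiry timestamp. The core pattern looks something like this (a minimal sketch of the idea, not cache_manager.py itself):
+
+ ```python
+ import hashlib, json, pickle, sqlite3, time
+
+ con = sqlite3.connect("./cache_db/cache.db")
+ con.execute("CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, val BLOB, expires REAL)")
+
+ def cached(fn, ttl: int = 86400):
+     def wrapper(*args, **kwargs):
+         key = hashlib.sha256(
+             json.dumps([fn.__name__, args, kwargs], default=str).encode()
+         ).hexdigest()
+         row = con.execute("SELECT val, expires FROM cache WHERE key = ?", (key,)).fetchone()
+         if row and row[1] > time.time():  # fresh hit: skip the expensive call
+             return pickle.loads(row[0])
+         result = fn(*args, **kwargs)
+         con.execute("INSERT OR REPLACE INTO cache VALUES (?, ?, ?)",
+                     (key, pickle.dumps(result), time.time() + ttl))
+         con.commit()
+         return result
+     return wrapper
+ ```
+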
67
+ ---
68
+
69
+ ## 🏗️ Architecture
70
+
71
+ ### **System Design**
72
+
73
+ ```
74
+ ┌─────────────────────────────────────────────────────────────┐
75
+ │ User Interfaces │
76
+ │ Gradio UI │ CLI │ REST API │ Python SDK │
77
+ └─────────────────────────┬───────────────────────────────────┘
78
+
79
+ ┌─────────────────────────────────────────────────────────────┐
80
+ │ DataScienceCopilot Orchestrator │
81
+ │ • LLM Function Calling (Groq/Gemini) │
82
+ │ • Session Memory Management │
83
+ │ • Tool Execution & Chaining │
84
+ │ • Error Recovery & Retry Logic │
85
+ └─────────────────────────┬───────────────────────────────────┘
86
+
87
+ ┌─────────────────────────────────────────────────────────────┐
88
+ │                    82+ Specialized Tools                    │
89
+ │ Data Profiling │ Cleaning │ Feature Engineering │
90
+ │ Model Training │ Visualization │ EDA Reports │
91
+ │ NLP/Text │ Computer Vision │ Time Series │ MLOps │
92
+ └─────────────────────────┬───────────────────────────────────┘
93
+
94
+ ┌─────────────────────────────────────────────────────────────┐
95
+ │ Execution & Storage Backends │
96
+ │ Local: Polars, sklearn, XGBoost │
97
+ │ Cloud: BigQuery, Vertex AI, Cloud Storage (planned) │
98
+ │ Cache: SQLite with TTL │
99
+ └─────────────────────────────────────────────────────────────┘
100
+ ```
101
+
102
+ ### **Tech Stack**
103
+
104
+ | Layer | Technologies |
105
+ |-------|-------------|
106
+ | **LLM** | Groq (llama-3.3-70b), Google Gemini (2.0-flash-exp) |
107
+ | **Data Processing** | Polars, DuckDB, Pandas, PyArrow, BigQuery |
108
+ | **ML/AI** | scikit-learn, XGBoost, LightGBM, CatBoost, Optuna |
109
+ | **Visualization** | Matplotlib, Seaborn, Plotly |
110
+ | **EDA Reports** | Sweetviz, ydata-profiling |
111
+ | **Explainability** | SHAP, LIME |
112
+ | **APIs** | FastAPI, Uvicorn |
113
+ | **UI** | Gradio, Typer + Rich (CLI) |
114
+ | **Storage** | SQLite (cache), CSV, Parquet, Google Cloud Storage |
115
+ | **Cloud** | Google Cloud Run, BigQuery, GCS, Vertex AI (planned) |
116
+
117
+ ---
118
+
119
+ ## 🚀 Quick Start
120
+
121
+ ### **Prerequisites**
122
+ - Python 3.9+
123
+ - API Keys: [Groq](https://console.groq.com) or [Google AI Studio](https://makersuite.google.com/app/apikey)
124
+
125
+ ### **Installation**
126
+
127
+ ```bash
128
+ # Clone repository
129
+ git clone https://github.com/Surfing-Ninja/Data-Science-Agent.git
130
+ cd Data-Science-Agent
131
+
132
+ # Create virtual environment
133
+ python -m venv .venv
134
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
135
+
136
+ # Install dependencies
137
+ pip install -r requirements.txt
138
+
139
+ # Set up environment variables
140
+ cp .env.example .env
141
+ # Edit .env and add your API keys:
142
+ # GROQ_API_KEY=your_groq_key
143
+ # GOOGLE_API_KEY=your_google_key (optional)
144
+ # LLM_PROVIDER=groq # or "gemini"
145
+ ```
146
+
147
+ ### **Usage Examples**
148
+
149
+ #### **1. Gradio Web UI** (Recommended for beginners)
150
+ ```bash
151
+ python chat_ui.py
152
+ # Opens at http://localhost:7860
153
+ # Upload CSV → Ask: "Analyze this data and predict house prices"
154
+ ```
155
+
156
+ #### **2. CLI Interface**
157
+ ```bash
158
+ # Complete workflow
159
+ python src/cli.py analyze data.csv --target price --task "Predict house prices"
160
+
161
+ # Quick profiling
162
+ python src/cli.py profile data.csv
163
+
164
+ # Train models only
165
+ python src/cli.py train cleaned.csv Survived --task-type classification
166
+ ```
167
+
168
+ #### **3. Python SDK**
169
+ ```python
170
+ from src.orchestrator import DataScienceCopilot
171
+
172
+ # Initialize agent
173
+ agent = DataScienceCopilot(
174
+ provider="groq", # or "gemini"
175
+ reasoning_effort="medium"
176
+ )
177
+
178
+ # Run workflow
179
+ result = agent.analyze(
180
+ file_path="titanic.csv",
181
+ task_description="Build a model to predict passenger survival",
182
+ target_col="Survived"
183
+ )
184
+
185
+ print(f"Status: {result['status']}")
186
+ print(f"Best Model: {result['best_model']}")
187
+ print(f"Accuracy: {result['best_score']}")
188
+ ```
189
+
190
+ #### **4. REST API** (Cloud Run Ready)
191
+ ```bash
192
+ # Start local server
193
+ cd src/api
194
+ python app.py
195
+ # Server runs at http://localhost:8080
196
+
197
+ # Make API call
198
+ curl -X POST http://localhost:8080/run \
199
+ -F "file=@data.csv" \
200
+ -F "task_description=Analyze and predict churn" \
201
+ -F "target_col=churn"
202
+ ```
203
+
204
+ ---
205
+
206
+ ## 📁 Project Structure
207
+
208
+ ```
209
+ Data-Science-Agent/
210
+ ├── src/
211
+ │ ├── orchestrator.py # Main agent brain (1,136 lines)
212
+ │ ├── cli.py # CLI interface (346 lines)
213
+ │ ├── api/
214
+ │ │ └── app.py # FastAPI Cloud Run wrapper (331 lines)
215
+ │ ├── bigquery/ # BigQuery integration 🆕
216
+ │ │ ├── __init__.py # BigQuery tools (4 functions)
217
+ │ │ └── client.py # BigQuery client wrapper
218
+ │ ├── storage/ # Artifact storage 🆕
219
+ │ │ ├── artifact_store.py # Local + GCS backends (613 lines)
220
+ │ │ └── helpers.py # Storage helper functions (125 lines)
221
+ │ ├── reasoning/ # LLM reasoning layer 🆕
222
+ │ │ ├── __init__.py # Core reasoning engine (350 lines)
223
+ │ │ ├── data_understanding.py # Data insights (6 functions)
224
+ │ │ ├── model_explanation.py # Model interpretation (6 functions)
225
+ │ │ └── business_summary.py # Business translations (7 functions)
226
+ │ ├── cache/
227
+ │ │ └── cache_manager.py # SQLite caching with TTL
228
+ │ ├── tools/ # 82+ specialized tools
229
+ │ │ ├── data_profiling.py # Dataset analysis
230
+ │ │ ├── data_cleaning.py # Cleaning & preprocessing
231
+ │ │ ├── feature_engineering.py # Feature creation
232
+ │ │ ├── model_training.py # ML training
233
+ │ │ ├── visualization_engine.py # Matplotlib/Seaborn plots
234
+ │ │ ├── plotly_visualizations.py # Interactive charts
235
+ │ │ ├── eda_reports.py # Sweetviz, ydata-profiling
236
+ │ │ ├── advanced_*.py # Advanced features
237
+ │ │ └── tools_registry.py # All 82 tool definitions (1,600+ lines)
238
+ │ └── utils/ # Helper utilities
239
+ │ ├── polars_helpers.py # Data manipulation
240
+ │ └── validation.py # Input validation
241
+ ├── chat_ui.py # Gradio web interface (912 lines)
242
+ ├── examples/
243
+ │ └── titanic_example.py # Complete workflow demo
244
+ ├── outputs/
245
+ │ ├── data/ # Processed datasets
246
+ │ ├── models/ # Trained models (.pkl)
247
+ │ ├── plots/ # Visualizations (.png, .html)
248
+ │ └── reports/ # EDA reports (.html)
249
+ ├── cache_db/ # SQLite cache storage
250
+ ├── requirements.txt # Python dependencies
251
+ ├── .env.example # Environment template
252
+ └── README.md # This file
253
+ ```
254
+
255
+ ---
256
+
257
+ ## 🛠️ Tool Categories (82 Tools Total)
258
+
259
+ ### **📊 Data Profiling & Analysis (7 tools)**
260
+ - `profile_dataset`, `detect_data_quality_issues`, `analyze_correlations`, `get_smart_summary`, `compare_datasets`, `calculate_statistics`, `detect_skewness`
261
+
262
+ ### **☁️ BigQuery Integration (4 tools)** 🆕
263
+ - `bigquery_profile_table`, `bigquery_load_table`, `bigquery_execute_query`, `bigquery_write_results`
264
+
265
+ ### **🧹 Data Cleaning (8 tools)**
266
+ - `clean_missing_values`, `handle_outliers`, `remove_duplicates`, `filter_rows`, `rename_columns`, `drop_columns`, `sort_data`, `fix_data_types`
267
+
268
+ ### **🔧 Feature Engineering (13 tools)**
269
+ - `encode_categorical`, `force_numeric_conversion`, `smart_type_inference`, `create_time_features`, `create_interaction_features`, `create_aggregation_features`, `create_ratio_features`, `create_statistical_features`, `create_log_features`, `create_binned_features`, `engineer_text_features`, `auto_feature_engineering`, `auto_feature_selection`
270
+
271
+ ### **🤖 Model Training & Tuning (6 tools)**
272
+ - `train_baseline_models`, `hyperparameter_tuning`, `train_ensemble_models`, `perform_cross_validation`, `generate_model_report`, `auto_ml_pipeline`
273
+
274
+ ### **📈 Visualization (11 tools)**
275
+ - `generate_all_plots`, `generate_data_quality_plots`, `generate_eda_plots`, `generate_model_performance_plots`, `generate_feature_importance_plot`, `generate_interactive_scatter`, `generate_interactive_histogram`, `generate_interactive_correlation_heatmap`, `generate_interactive_box_plots`, `generate_interactive_time_series`, `generate_plotly_dashboard`
276
+
277
+ ### **📊 EDA Reports (3 tools)**
278
+ - `generate_sweetviz_report`, `generate_ydata_profiling_report`, `generate_combined_eda_report`
279
+
280
+ ### **🔬 Advanced Analysis (11 tools)**
281
+ - `perform_eda_analysis`, `detect_model_issues`, `detect_anomalies`, `detect_and_handle_multicollinearity`, `perform_statistical_tests`, `analyze_root_cause`, `detect_trends_and_seasonality`, `detect_anomalies_advanced`, `perform_hypothesis_testing`, `analyze_distribution`, `perform_segment_analysis`
282
+
283
+ ### **📝 Data Wrangling (3 tools)**
284
+ - `merge_datasets`, `concat_datasets`, `reshape_dataset`
285
+
286
+ ### **🚀 MLOps & Production (5 tools)**
287
+ - `monitor_model_drift`, `explain_predictions`, `generate_model_card`, `perform_ab_test_analysis`, `detect_feature_leakage`
288
+
289
+ ### **⏰ Time Series (3 tools)**
290
+ - `forecast_time_series`, `detect_seasonality_trends`, `create_time_series_features`
291
+
292
+ ### **💼 Business Intelligence (4 tools)**
293
+ - `perform_cohort_analysis`, `perform_rfm_analysis`, `detect_causal_relationships`, `generate_business_insights`
294
+
295
+ ### **📚 NLP/Text (4 tools)**
296
+ - `perform_topic_modeling`, `perform_named_entity_recognition`, `analyze_sentiment_advanced`, `perform_text_similarity`
297
+
298
+ ### **🖼️ Computer Vision (3 tools)**
299
+ - `extract_image_features`, `perform_image_clustering`, `analyze_tabular_image_hybrid`
300
+
301
+ ---
302
+
303
+ ## 🎯 Advanced Features
304
+
305
+ ### **1. Session Memory**
306
+ The agent remembers context across conversations:
307
+
308
+ ```python
309
+ # Conversation 1
310
+ "Train a model on earthquake.csv to predict magnitude"
311
+ → Agent trains XGBoost, achieves 0.92 R²
312
+
313
+ # Conversation 2 (Same session)
314
+ "Cross-validate it"
315
+ → Agent knows: model=XGBoost, dataset=earthquake.csv, target=magnitude
316
+ → Runs 5-fold CV automatically
317
+ ```
318
+
319
+ ### **2. Code Interpreter**
320
+ Execute custom Python code for tasks beyond predefined tools:
321
+
322
+ ```python
323
+ User: "Make a Plotly scatter with custom dropdown filters"
324
+
325
+ Agent: execute_python_code(code='''
326
+ import plotly.graph_objects as go
327
+ df = pd.read_csv('./temp/data.csv')
328
+ # Custom visualization code...
329
+ fig.write_html('./outputs/code/custom_plot.html')
330
+ ''')
331
+ ```
332
+
333
+ ### **3. Token Optimization**
334
+ System stays under LLM token limits even with 75 tools:
335
+
336
+ | Component | Before | After | Savings |
337
+ |-----------|--------|-------|---------|
338
+ | Tool Schemas | 8,193 tokens | 5,463 tokens | 34% |
339
+ | Tool Results | 5,000+ tokens | 50-200 tokens | 90%+ |
340
+
341
+ ### **4. Error Recovery**
342
+ Agent learns from errors and auto-corrects:
343
+
344
+ ```python
345
+ # Attempt 1
346
+ train_baseline_models(target_col="magnitude")
347
+ → Error: Column 'magnitude' not found. Hint: Did you mean 'mag'?
348
+
349
+ # Attempt 2 (Automatic)
350
+ train_baseline_models(target_col="mag")
351
+ → Success! Trained 4 models, best: XGBoost (0.92 R²)
352
+ ```
353
+
354
+ ---
355
+
356
+ ## ☁️ Cloud Features
357
+
358
+ ### **1. BigQuery Integration** 🆕
359
+ Direct access to BigQuery tables without local downloads:
360
+
361
+ ```python
362
+ # Profile a BigQuery table
363
+ agent.chat("Profile the table project.dataset.sales")
364
+
365
+ # Query and analyze
366
+ agent.chat("Query top 10 customers by revenue from BigQuery")
367
+
368
+ # Write results back
369
+ agent.chat("Write the cleaned data to BigQuery table project.dataset.sales_clean")
370
+ ```
371
+
372
+ **Available Tools:**
373
+ - `bigquery_profile_table`: Get statistics for any BigQuery table
374
+ - `bigquery_load_table`: Load BigQuery data into local Polars DataFrame
375
+ - `bigquery_execute_query`: Run SQL queries directly on BigQuery
376
+ - `bigquery_write_results`: Write processed data back to BigQuery
377
+
378
+ **Setup:**
379
+ ```bash
380
+ # Install BigQuery dependencies
381
+ pip install google-cloud-bigquery db-dtypes
382
+
383
+ # Set environment variable
384
+ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json"
385
+ ```
386
+
387
+ **Looker-Compatible Schemas:**
388
+
389
+ The project defines stable BigQuery table schemas for BI tools (see [`BIGQUERY_SCHEMAS.md`](BIGQUERY_SCHEMAS.md)):
390
+ - 📊 `model_metrics` - Model performance tracking over time
391
+ - 🎯 `feature_importance` - Feature impact analysis
392
+ - 🔮 `predictions` - Prediction monitoring with actuals
393
+ - 📋 `data_profile_summary` - Data quality metrics
394
+
395
+ **Design Principles:**
396
+ - Stable schemas (no breaking changes without versioning)
397
+ - Consistent snake_case naming
398
+ - Clear dimension/metric separation
399
+ - Dashboard-ready with sample Looker views
400
+
401
+ ### **2. Artifact Storage** 🆕
402
+ Unified storage abstraction - switch between local and GCS with zero code changes:
403
+
404
+ ```python
405
+ # Local storage (default)
406
+ agent.save_model(model, "my_model.pkl")
407
+ # → Saves to outputs/models/my_model.pkl
408
+
409
+ # GCS storage (automatic when GCS credentials present)
410
+ agent.save_model(model, "my_model.pkl")
411
+ # → Saves to gs://your-bucket/models/my_model_v1.pkl with versioning
412
+ ```
413
+
414
+ **Features:**
415
+ - **Automatic Backend Selection**: Uses GCS if credentials available, falls back to local
416
+ - **Versioning**: Automatic version suffixes for GCS artifacts
417
+ - **Metadata**: Stores creation time, size, checksums
418
+ - **Unified API**: Same code works for local and cloud storage
419
+
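+ The backend selection reduces to one probe: if a GCS bucket is configured, upload there; otherwise copy locally. A hedged sketch of the pattern (function and path names are placeholders, not the actual artifact_store.py API):
+
+ ```python
+ import os, shutil
+
+ def save_artifact(local_path: str, name: str) -> str:
+     bucket_name = os.getenv("GCS_BUCKET")
+     if bucket_name:
+         from google.cloud import storage  # requires google-cloud-storage
+         blob = storage.Client().bucket(bucket_name).blob(f"models/{name}")
+         blob.upload_from_filename(local_path)
+         return f"gs://{bucket_name}/models/{name}"
+     # Fallback: plain local copy under outputs/
+     dest = os.path.join("outputs", "models", name)
+     os.makedirs(os.path.dirname(dest), exist_ok=True)
+     shutil.copy(local_path, dest)
+     return dest
+ ```
+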
420
+ **Setup:**
421
+ ```bash
422
+ # Install GCS dependencies
423
+ pip install google-cloud-storage
424
+
425
+ # Set bucket (optional, defaults to local)
426
+ export GCS_BUCKET="your-gcs-bucket-name"
427
+ ```
428
+
429
+ ### **3. Reasoning Modules** 🆕
430
+ Dedicated LLM reasoning layer with clear boundaries (no raw data access, no training decisions):
431
+
432
+ ```python
433
+ from reasoning.data_understanding import explain_dataset
434
+ from reasoning.model_explanation import explain_model_performance
435
+ from reasoning.business_summary import create_executive_summary
436
+
437
+ # Data insights
438
+ insights = explain_dataset(summary={
439
+ "rows": 10000,
440
+ "columns": 20,
441
+ "missing_values": {"age": {"count": 150, "percentage": 1.5}}
442
+ })
443
+
444
+ # Model explanations
445
+ explanation = explain_model_performance(metrics={
446
+ "accuracy": 0.95,
447
+ "precision": 0.92,
448
+ "recall": 0.88
449
+ }, task_type="classification")
450
+
451
+ # Business summaries
452
+ summary = create_executive_summary(
453
+ project_results={"model_accuracy": 0.95},
454
+ project_name="churn_prediction",
455
+ business_objective="Reduce customer churn"
456
+ )
457
+ ```
458
+
459
+ **19 Reasoning Functions:**
460
+ - **Data Understanding**: explain_dataset, suggest_transformations, identify_feature_engineering_opportunities, explain_missing_values, compare_datasets (6 functions)
461
+ - **Model Explanation**: explain_model_performance, interpret_feature_importance, diagnose_model_failure, explain_prediction, compare_models, explain_overfitting (6 functions)
462
+ - **Business Summary**: create_executive_summary, estimate_business_impact, create_stakeholder_report, translate_technical_to_business, prioritize_next_steps, explain_to_customer, assess_deployment_readiness (7 functions)
463
+
464
+ **Design Principles:**
465
+ - ✅ **NO Raw Data Access**: Only summaries/statistics allowed
466
+ - ✅ **NO Training Decisions**: Only explanations, never execution
467
+ - ✅ **Structured Output**: JSON schemas for cacheability
468
+ - ✅ **Dual Backend**: Works with both Gemini and Groq
469
+
470
+ ---
471
+
472
+ ## 🔧 Configuration
473
+
474
+ ### **Environment Variables** (`.env`)
475
+
476
+ ```bash
477
+ # LLM Provider
478
+ LLM_PROVIDER=groq # "groq" or "gemini"
479
+ GROQ_API_KEY=your_groq_key
480
+ GOOGLE_API_KEY=your_google_key # Optional
481
+
482
+ # Model Selection
483
+ GROQ_MODEL=llama-3.3-70b-versatile
484
+ GEMINI_MODEL=gemini-2.0-flash-exp
485
+ REASONING_EFFORT=medium # low, medium, high
486
+
487
+ # Cache Settings
488
+ CACHE_DB_PATH=./cache_db/cache.db
489
+ CACHE_TTL_SECONDS=86400 # 24 hours
490
+
491
+ # Cloud Features (Optional)
492
+ GCS_BUCKET=your-gcs-bucket-name # For artifact storage
493
+ GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-key.json # For BigQuery + GCS
494
+
495
+ # Cloud Run (for API deployment)
496
+ PORT=8080
497
+ ```
498
+
499
+ ### **Provider Comparison**
500
+
501
+ | Feature | Groq | Gemini |
502
+ |---------|------|--------|
503
+ | **Model** | llama-3.3-70b-versatile | gemini-2.0-flash-exp |
504
+ | **Speed** | ⚡ Extremely fast (LPU) | 🚀 Very fast |
505
+ | **Free Tier** | 100K tokens/day | 1,500 requests/day |
506
+ | **Rate Limit** | 12K tokens/min | 10 requests/min |
507
+ | **Best For** | High-volume, low-latency | Free tier, high quota |
508
+
509
+ ---
510
+
511
+ ## 🚀 Cloud Deployment (Google Cloud Run)
512
+
513
+ ### **Deploy REST API**
514
+
515
+ ```bash
516
+ # 1. Build Docker image (Dockerfile provided)
517
+ docker build -t data-science-agent .
518
+
519
+ # 2. Push to Google Container Registry
520
+ gcloud builds submit --tag gcr.io/PROJECT_ID/data-science-agent
521
+
522
+ # 3. Deploy to Cloud Run
523
+ gcloud run deploy data-science-agent \
524
+ --image gcr.io/PROJECT_ID/data-science-agent \
525
+ --platform managed \
526
+ --region us-central1 \
527
+ --allow-unauthenticated \
528
+ --memory 4Gi \
529
+ --timeout 3600 \
530
+ --set-env-vars GROQ_API_KEY=your_key,LLM_PROVIDER=groq
531
+
532
+ # 4. Test deployment
533
+ curl -X POST https://your-service-url/run \
534
+ -F "file=@data.csv" \
535
+ -F "task_description=Predict churn"
536
+ ```
537
+
538
+ ### **API Endpoints**
539
+
540
+ - `GET /` - Health check
541
+ - `GET /health` - Readiness probe
542
+ - `POST /run` - Full analysis workflow
543
+ - `POST /profile` - Quick dataset profiling
544
+ - `GET /tools` - List all available tools
545
+
546
+ ---
547
+
548
+ ## 🗺️ Roadmap
549
+
550
+ ### **Phase 1: Core Agent** ✅ COMPLETE
551
+ - [x] 75 specialized tools
552
+ - [x] Dual LLM support (Groq + Gemini)
553
+ - [x] CLI + Gradio UI
554
+ - [x] SQLite caching
555
+ - [x] Token optimization
556
+
557
+ ### **Phase 2: Intelligence** ✅ COMPLETE
558
+ - [x] Session memory
559
+ - [x] Code interpreter
560
+ - [x] Error recovery
561
+ - [x] EDA reports (Sweetviz, ydata-profiling)
562
+ - [x] Interactive Plotly visualizations
563
+
564
+ ### **Phase 3: Cloud Native** ✅ COMPLETE
565
+ - [x] FastAPI Cloud Run wrapper with 4 REST endpoints
566
+ - [x] BigQuery integration (4 tools: profile, load, query, write)
567
+ - [x] Artifact Storage abstraction (Local ↔ GCS switching)
568
+ - [x] Reasoning modules for LLM explanations (19 functions)
569
+ - [x] Looker-compatible BigQuery schemas (4 stable tables)
570
+ - [ ] Vertex AI model training (planned)
571
+ - [ ] Cloud Logging & Monitoring (planned)
572
+
573
+ ### **Phase 4: Enterprise** 📋 PLANNED
574
+ - [ ] Multi-user authentication
575
+ - [ ] Team workspaces
576
+ - [ ] Model registry
577
+ - [ ] Automated retraining pipelines
578
+
579
+ ### **Phase 5: Kaggle Integration** 🎯 FUTURE
580
+ - [ ] Direct Kaggle API integration
581
+ - [ ] Automated competition workflow
582
+ - [ ] Ensemble strategies
583
+ - [ ] Submission automation
584
+
585
+ ---
586
+
587
+ ## 🤝 Contributing
588
+
589
+ Contributions welcome! Areas for improvement:
590
+
591
+ 1. **New Tools**: Time series forecasting, NLP preprocessing, image augmentation
592
+ 2. **Cloud Backends**: AWS, Azure support
593
+ 3. **Performance**: Optimize tool execution, reduce latency
594
+ 4. **UI/UX**: Better visualization, workflow builder
595
+ 5. **Documentation**: Tutorials, video guides, blog posts
596
+
597
+ ---
598
+
599
+ ## 📜 License
600
+
601
+ MIT License - See LICENSE file for details
602
+
603
+ ---
604
+
605
+ ## 📧 Support & Community
606
+
607
+ - **Issues**: [GitHub Issues](https://github.com/Surfing-Ninja/Data-Science-Agent/issues)
608
+ - **Discussions**: [GitHub Discussions](https://github.com/Surfing-Ninja/Data-Science-Agent/discussions)
609
+
610
+ ---
611
+
612
+ ## 📊 Project Stats
613
+
614
+ - **Lines of Code**: ~18,000+
615
+ - **Tools**: 82 specialized functions (75 core + 4 BigQuery + 3 storage helpers)
616
+ - **Reasoning Functions**: 19 LLM-powered explanation modules
617
+ - **Supported Models**: 10+ (LR, Ridge, Lasso, RF, XGBoost, LightGBM, CatBoost, etc.)
618
+ - **Visualization Types**: 20+ (static + interactive)
619
+ - **Data Formats**: CSV, Parquet, JSON, BigQuery tables
620
+ - **Cloud Platforms**: Google Cloud (Run, BigQuery, GCS) - AWS/Azure planned
621
+
622
+ ---
623
+
624
+ <div align="center">
625
+
626
+ **Built with ❤️ for the Data Science Community**
627
+
628
+ *"Making data science accessible through AI automation"*
629
+
630
+ ⭐ Star this repo if you find it useful! ⭐
631
+
632
+ </div>
build-and-deploy.ps1 ADDED
@@ -0,0 +1,39 @@
1
+ # Build and Deploy Script for Data Science Agent (Windows)
2
+
3
+ Write-Host "🚀 Building and Deploying Data Science Agent..." -ForegroundColor Cyan
4
+
5
+ # Step 1: Build React Frontend
6
+ Write-Host ""
7
+ Write-Host "📦 Building React frontend..." -ForegroundColor Yellow
8
+ Set-Location FRRONTEEEND
9
+ npm.cmd install
10
+ if ($LASTEXITCODE -ne 0) {
11
+ Write-Host "❌ Frontend npm install failed!" -ForegroundColor Red
12
+ exit 1
13
+ }
14
+ npm.cmd run build
15
+ if ($LASTEXITCODE -ne 0) {
16
+ Write-Host "❌ Frontend build failed!" -ForegroundColor Red
17
+ exit 1
18
+ }
19
+ Set-Location ..
20
+
21
+ Write-Host ""
22
+ Write-Host "✅ Frontend built successfully!" -ForegroundColor Green
23
+ Write-Host " Built files are in: FRRONTEEEND\dist" -ForegroundColor Gray
24
+
25
+ # Step 2: Install Python dependencies
26
+ Write-Host ""
27
+ Write-Host "📦 Installing Python dependencies..." -ForegroundColor Yellow
28
+ pip install -r requirements.txt
29
+ if ($LASTEXITCODE -ne 0) {
30
+ Write-Host "⚠️ Some Python dependencies may have failed to install" -ForegroundColor Yellow
31
+ }
32
+
33
+ Write-Host ""
34
+ Write-Host "✅ Build complete!" -ForegroundColor Green
35
+ Write-Host ""
36
+ Write-Host "To run the application:" -ForegroundColor Cyan
37
+ Write-Host " python src\api\app.py" -ForegroundColor White
38
+ Write-Host ""
39
+ Write-Host "Access the app at: http://localhost:8080" -ForegroundColor Green
build-and-deploy.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+ # Build and Deploy Script for Data Science Agent
3
+
4
+ set -e # Exit on error
5
+
6
+ echo "🚀 Building and Deploying Data Science Agent..."
7
+
8
+ # Step 1: Build React Frontend
9
+ echo ""
10
+ echo "📦 Building React frontend..."
11
+ cd FRRONTEEEND
12
+ npm install
13
+ npm run build
14
+ cd ..
15
+
16
+ # Step 2: Copy built frontend to deployment location (if needed)
17
+ echo ""
18
+ echo "✅ Frontend built successfully!"
19
+ echo " Built files are in: FRRONTEEEND/dist"
20
+
21
+ # Step 3: Install Python dependencies
22
+ echo ""
23
+ echo "📦 Installing Python dependencies..."
24
+ pip install -r requirements.txt
25
+
26
+ echo ""
27
+ echo "✅ Build complete!"
28
+ echo ""
29
+ echo "To run the application:"
30
+ echo " 1. Backend: python -m uvicorn src.api.app:app --host 0.0.0.0 --port 8080"
31
+ echo " 2. Or use: python src/api/app.py"
32
+ echo ""
33
+ echo "Access the app at: http://localhost:8080"
cache_db/.gitkeep ADDED
File without changes
chat_ui.py ADDED
@@ -0,0 +1,1073 @@
1
+ """
2
+ AI Agent Data Scientist - Interactive Chat UI
3
+ ==============================================
4
+
5
+ A simple web interface to interact with your AI Agent.
6
+ Upload datasets, ask questions, and get AI-powered insights!
7
+ """
8
+
9
+ import gradio as gr
10
+ import sys
11
+ import os
12
+ import shutil
13
+ from pathlib import Path
14
+ import traceback
15
+
16
+ # Add src to path
17
+ sys.path.append('src')
18
+
19
+ from tools.data_profiling import profile_dataset, detect_data_quality_issues
20
+ from tools.model_training import train_baseline_models
21
+
22
+ # Try to import AI agent (optional)
23
+ try:
24
+ from orchestrator import DataScienceCopilot
25
+ agent = DataScienceCopilot()
26
+ AI_ENABLED = True
27
+ print("✅ AI Agent loaded successfully!")
28
+ print(f"📊 Model: {agent.model}")
29
+ print(f"🔧 Tools available: {len(agent.tool_functions)}")
30
+ except Exception as e:
31
+ print(f"ℹ️ Running in manual mode (AI agent not available)")
32
+ print(f" Error: {str(e)}")
33
+ print("💡 You can still use all the quick actions and tools!")
34
+ AI_ENABLED = False
35
+ agent = None
36
+
37
+ # Store uploaded file path
38
+ current_file = None
39
+ current_profile = None
40
+ last_agent_response = None # Store last agent response for visualization extraction
41
+
42
+
43
+ # Helper functions for Gradio 6.x message format
44
+ def add_message(history, role, content):
45
+ """Add a message to history in Gradio 6.x format."""
46
+ if history is None:
47
+ history = []
48
+ history.append({"role": role, "content": content})
49
+ return history
50
+
51
+
52
+ def add_user_message(history, content):
53
+ """Add a user message to history."""
54
+ return add_message(history, "user", content)
55
+
56
+
57
+ def add_assistant_message(history, content):
58
+ """Add an assistant message to history."""
59
+ return add_message(history, "assistant", content)
60
+
61
+
62
+ def update_last_assistant_message(history, content):
63
+ """Update the last assistant message in history."""
64
+ if history and len(history) > 0 and history[-1].get("role") == "assistant":
65
+ history[-1]["content"] = content
66
+ return history
67
+
68
+
69
+ def get_last_user_content(history):
70
+ """Get the content of the last user message."""
71
+ if history:
72
+ for msg in reversed(history):
73
+ if msg.get("role") == "user":
74
+ return msg.get("content", "")
75
+ return ""
76
+
77
+
78
+ def analyze_dataset(file, user_message, history):
79
+ """Process uploaded dataset(s) and user message. Supports single or multiple file uploads."""
80
+ global current_file, current_profile, last_agent_response
81
+
82
+ # Initialize with empty plot list (will collect PNG file paths)
83
+ plots_paths = []
84
+ html_reports = [] # Initialize HTML reports list
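+ # NOTE: every yield from this generator must produce exactly four values —
+ # (chat history, cleared input, plot paths, HTML report paths) — matching
+ # the four output components wired to it in the Gradio event handlers below.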
85
+
86
+ # Initialize history if None
87
+ if history is None:
88
+ history = []
89
+
90
+ # Debug: Log the call
91
+ print(f"[DEBUG] analyze_dataset called - file: {file is not None}, message: '{user_message}', current_file: {current_file}")
92
+
93
+ try:
94
+ # Handle file uploads (single or multiple)
95
+ if file is not None:
96
+ # file can be a single filepath or a list of filepaths
97
+ files_to_process = file if isinstance(file, list) else [file]
98
+
99
+ # Filter out None values
100
+ files_to_process = [f for f in files_to_process if f is not None]
101
+
102
+ if len(files_to_process) > 0:
103
+ print(f"[DEBUG] Processing {len(files_to_process)} file(s) upload")
104
+
105
+ # Copy all files to simpler paths
106
+ os.makedirs("./temp", exist_ok=True)
107
+ processed_files = []
108
+ seen_files = {} # Track files by content hash to detect duplicates
109
+ duplicate_count = 0
110
+
111
+ for uploaded_file in files_to_process:
112
+ simple_filename = Path(uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file).name
113
+ file_source = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
114
+
115
+ # Calculate file hash to detect duplicates (even with different names)
116
+ import hashlib
117
+ hasher = hashlib.md5()
118
+ with open(file_source, 'rb') as f:
119
+ # Read file in chunks to handle large files efficiently
120
+ for chunk in iter(lambda: f.read(8192), b""):
121
+ hasher.update(chunk)
122
+ file_hash = hasher.hexdigest()
123
+
124
+ # Check if this exact file was already uploaded
125
+ if file_hash in seen_files:
126
+ print(f"[DEBUG] Duplicate file detected: {simple_filename} (same as {seen_files[file_hash]})")
127
+ duplicate_count += 1
128
+ continue # Skip duplicate
129
+
130
+ # Not a duplicate - process it
131
+ simple_path = f"./temp/{simple_filename}"
132
+
133
+ # Handle filename collision (different files with same name)
134
+ if os.path.exists(simple_path):
135
+ # Check if existing file is the same (by comparing with already processed files)
136
+ existing_in_processed = simple_path in processed_files
137
+ if not existing_in_processed:
138
+ # Different file with same name - add suffix
139
+ base_name = Path(simple_filename).stem
140
+ extension = Path(simple_filename).suffix
141
+ counter = 1
142
+ while os.path.exists(f"./temp/{base_name}_{counter}{extension}"):
143
+ counter += 1
144
+ simple_filename = f"{base_name}_{counter}{extension}"
145
+ simple_path = f"./temp/{simple_filename}"
146
+ print(f"[DEBUG] Filename collision - renamed to: {simple_filename}")
147
+
148
+ shutil.copy2(file_source, simple_path)
149
+ processed_files.append(simple_path)
150
+ seen_files[file_hash] = simple_filename
151
+ print(f"[DEBUG] Copied file to: {simple_path}")
152
+
153
+ # Set current_file to the first file (for single-file operations)
154
+ # For multi-file operations, the agent will use all files from ./temp/
155
+ current_file = processed_files[0] if processed_files else None
156
+
157
+ # Only show file upload response if there's no user message
158
+ if not (user_message and user_message.strip()):
159
+ if len(processed_files) == 0:
160
+ # All files were duplicates
161
+ response = f"⚠️ **No New Files Uploaded**\n\n"
162
+ response += f"All {len(files_to_process)} file(s) were duplicates of already uploaded files.\n\n"
163
+ response += "Your previously uploaded dataset is still active."
164
+ elif len(processed_files) == 1:
165
+ # Single file upload - show detailed profile
166
+ response = f"📊 **Dataset Uploaded Successfully!**\n\n"
167
+ if duplicate_count > 0:
168
+ response += f"ℹ️ *({duplicate_count} duplicate file(s) were skipped)*\n\n"
169
+ response += f"**File:** {Path(current_file).name}\n\n"
170
+
171
+ # Get basic profile
172
+ profile = profile_dataset(current_file)
173
+ current_profile = profile
174
+
175
+ response += f"**Dataset Overview:**\n"
176
+ response += f"- Rows: {profile['shape']['rows']:,}\n"
177
+ response += f"- Columns: {profile['shape']['columns']}\n"
178
+
179
+ # Handle memory_usage (can be float or dict)
180
+ memory = profile.get('memory_usage', 0)
181
+ if isinstance(memory, dict):
182
+ memory = memory.get('total_mb', 0)
183
+ response += f"- Memory: {memory:.2f} MB\n\n"
184
+
185
+ response += f"**Column Types:**\n"
186
+ response += f"- Numeric: {len(profile['column_types']['numeric'])} columns\n"
187
+ response += f"- Categorical: {len(profile['column_types']['categorical'])} columns\n"
188
+ response += f"- Datetime: {len(profile['column_types']['datetime'])} columns\n\n"
189
+
190
+ # Check data quality
191
+ quality = detect_data_quality_issues(current_file)
192
+ if quality['critical']:
193
+ response += f"🔴 **Critical Issues:** {len(quality['critical'])}\n"
194
+ for issue in quality['critical'][:3]:
195
+ response += f" - {issue['message']}\n"
196
+ if quality['warning']:
197
+ response += f"🟡 **Warnings:** {len(quality['warning'])}\n"
198
+ for issue in quality['warning'][:3]:
199
+ response += f" - {issue['message']}\n"
200
+ else:
201
+ # Multiple files uploaded
202
+ response = f"📊 **{len(processed_files)} Datasets Uploaded Successfully!**\n\n"
203
+ if duplicate_count > 0:
204
+ response += f"ℹ️ *({duplicate_count} duplicate file(s) were skipped)*\n\n"
205
+ response += f"**Files:**\n"
206
+ for i, fp in enumerate(processed_files, 1):
207
+ response += f"{i}. {Path(fp).name}\n"
208
+ response += f"\n**💡 You can now use multi-dataset operations!**\n\n"
209
+
210
+ response += f"\n\n💬 **What would you like to do with {'this dataset' if len(processed_files) == 1 else 'these datasets'}?**\n\n"
211
+ response += "You can ask me to:\n"
212
+ if len(processed_files) > 1:
213
+ response += "- **Merge these datasets** (e.g., 'merge customers and orders on customer_id')\n"
214
+ response += "- **Combine/concatenate** them (e.g., 'combine all monthly sales files')\n"
215
+ response += "- Train a classification or regression model\n"
216
+ response += "- Analyze specific columns\n"
217
+ response += "- Detect outliers\n"
218
+ response += "- Engineer features\n"
219
+ response += "- Generate predictions\n"
220
+ response += "- And much more!\n"
221
+
222
+ # Add assistant message to history
223
+ history = add_assistant_message(history, response)
224
+ yield history, "", [], []
225
+ return
226
+ # If user uploaded file AND sent a message, don't return - continue to process the message
227
+ elif user_message and user_message.strip():
228
+ # Continue processing the message below
229
+ pass
230
+
231
+ # If user sends a message about the current file
232
+ print(f"[DEBUG] Checking message conditions: user_message={bool(user_message and user_message.strip())}, current_file={bool(current_file)}")
233
+ if user_message and user_message.strip() and current_file:
234
+ print(f"[DEBUG] User message detected. AI_ENABLED={AI_ENABLED}, agent={agent is not None}")
235
+ if AI_ENABLED and agent:
236
+ print(f"[DEBUG] Entering AI Agent block...")
237
+ try:
238
+ # Show immediate processing message
239
+ print(f"🤖 AI Agent analyzing: {user_message}")
240
+ history = add_user_message(history, user_message)
241
+ history = add_assistant_message(history, "🤖 **AI Agent is thinking...**\n\n⏳ Analyzing your request and planning the workflow...")
242
+ yield history, "", [], []
243
+
244
+ # Use the AI agent to process the request
245
+ print(f"📂 File path: {current_file}")
246
+ print(f"📝 Task: {user_message}")
247
+ print(f"🚀 Calling agent.analyze()...")
248
+
249
+ agent_response = agent.analyze(
250
+ file_path=current_file,
251
+ task_description=user_message,
252
+ use_cache=False, # Disable cache to avoid dict hashing issues
253
+ stream=False
254
+ )
255
+
256
+ print(f"✅ Agent response received: {agent_response.get('status', 'unknown')}")
257
+
258
+ # Store agent response for visualization extraction
259
+ last_agent_response = agent_response
260
+
261
+ # Format the response
262
+ if agent_response.get('status') == 'success':
263
+ response = f"🤖 **AI Agent Analysis Complete!**\n\n"
264
+ response += f"{agent_response.get('summary', '')}\n\n"
265
+
266
+ if 'workflow_history' in agent_response and agent_response['workflow_history']:
267
+ response += f"**Execution Summary:**\n"
268
+ response += f"- Tools Executed: {len(agent_response['workflow_history'])}\n"
269
+ response += f"- Iterations: {agent_response.get('iterations', 0)}\n"
270
+ response += f"- Time: {agent_response.get('execution_time', 0):.1f}s\n\n"
271
+
272
+ # Find and display MODEL TRAINING RESULTS with ALL METRICS
273
+ model_results = None
274
+ for step in agent_response['workflow_history']:
275
+ if step.get('tool') == 'train_baseline_models':
276
+ result = step.get('result', {})
277
+ if isinstance(result, dict) and 'result' in result:
278
+ model_results = result['result']
279
+ elif isinstance(result, dict):
280
+ model_results = result
281
+ break
282
+
283
+ if model_results and 'models' in model_results:
284
+ response += f"## 🎯 Model Training Results\n\n"
285
+ task_type = model_results.get('task_type', 'unknown')
286
+ response += f"**Task Type:** {task_type.title()}\n"
287
+ response += f"**Features:** {model_results.get('n_features', 0)}\n"
288
+ response += f"**Training Samples:** {model_results.get('train_size', 0):,}\n"
289
+ response += f"**Test Samples:** {model_results.get('test_size', 0):,}\n\n"
290
+
291
+ # Show ALL models tested
292
+ response += "### 📊 All Models Tested:\n\n"
293
+ models_data = model_results.get('models', {})
294
+
295
+ for model_name, model_info in models_data.items():
296
+ if 'test_metrics' in model_info:
297
+ metrics = model_info['test_metrics']
298
+ response += f"**{model_name}:**\n"
299
+
300
+ if task_type == 'classification':
301
+ response += f"- Accuracy: {metrics.get('accuracy', 0):.4f}\n"
302
+ response += f"- Precision: {metrics.get('precision', 0):.4f}\n"
303
+ response += f"- Recall: {metrics.get('recall', 0):.4f}\n"
304
+ response += f"- F1 Score: {metrics.get('f1', 0):.4f}\n"
305
+ else:
306
+ response += f"- R² Score: {metrics.get('r2', 0):.4f}\n"
307
+ response += f"- RMSE: {metrics.get('rmse', 0):.2f}\n"
308
+ response += f"- MAE: {metrics.get('mae', 0):.2f}\n"
309
+ response += f"- MAPE: {metrics.get('mape', 0):.2f}%\n"
310
+ response += "\n"
311
+
312
+ # Highlight BEST MODEL
313
+ best_model = model_results.get('best_model', {})
314
+ if best_model and best_model.get('name'):
315
+ response += f"### 🏆 Best Model: **{best_model['name']}**\n"
316
+ response += f"Score: {best_model.get('score', 0):.4f}\n\n"
317
+
318
+ # Show workflow execution summary
319
+ response += "### 🔧 Workflow Steps:\n"
320
+ for i, step in enumerate(agent_response['workflow_history'], 1):
321
+ tool_name = step['tool']
322
+ success = step['result'].get('success', False)
323
+ icon = "✅" if success else "❌"
324
+ response += f"{i}. {icon} {tool_name}\n"
325
+ response += "\n"
326
+
327
+ # Check for plots AND reports in workflow results
328
+ html_reports = [] # Separate list for HTML reports
329
+
330
+ for step in agent_response['workflow_history']:
331
+ result = step.get('result', {})
332
+
333
+ # Deep search for plots and reports in nested results
334
+ def find_plots_and_reports(obj, plots_list, reports_list):
335
+ if isinstance(obj, dict):
336
+ # Check direct plot/report keys
337
+ for key in ['plot_path', 'plot_file', 'output_path', 'html_path', 'report_path',
338
+ 'plots', 'plot_paths', 'performance_plots', 'feature_importance_plot']:
339
+ if key in obj and obj[key]:
340
+ if isinstance(obj[key], list):
341
+ for path in obj[key]:
342
+ if isinstance(path, str) and os.path.exists(path):
343
+ if path.endswith('.html'):
344
+ # Check if it's a report (in reports folder) or interactive plot
345
+ if '/reports/' in path or 'report' in Path(path).stem.lower():
346
+ reports_list.append(path)
347
+ else:
348
+ reports_list.append(path) # Interactive plots also go to reports
349
+ elif path.endswith(('.png', '.jpg', '.jpeg')):
350
+ plots_list.append(path)
351
+ elif isinstance(obj[key], str) and os.path.exists(obj[key]):
352
+ if obj[key].endswith('.html'):
353
+ if '/reports/' in obj[key] or 'report' in Path(obj[key]).stem.lower():
354
+ reports_list.append(obj[key])
355
+ else:
356
+ reports_list.append(obj[key])
357
+ elif obj[key].endswith(('.png', '.jpg', '.jpeg')):
358
+ plots_list.append(obj[key])
359
+ # Recursively search nested dicts
360
+ for value in obj.values():
361
+ find_plots_and_reports(value, plots_list, reports_list)
362
+
363
+ find_plots_and_reports(result, plots_paths, html_reports)
364
+
365
+ # Remove duplicates while preserving order
366
+ plots_paths = list(dict.fromkeys(plots_paths))
367
+ html_reports = list(dict.fromkeys(html_reports))
368
+
369
+ # Display visualization and report information in response
370
+ if plots_paths or html_reports:
371
+ response += f"## 📊 Generated Outputs\n\n"
372
+
373
+ if plots_paths:
374
+ response += f"### 📈 Visualizations ({len(plots_paths)} plots)\n"
375
+ response += "✅ Plots are displayed in the **Visualization Gallery** below!\n\n"
376
+
377
+ # List plot files
378
+ for i, plot_path in enumerate(plots_paths[:10], 1):
379
+ try:
380
+ plot_name = Path(plot_path).stem.replace('_', ' ').title()
381
+ rel_path = os.path.relpath(plot_path, '.')
382
+ response += f"{i}. 📊 **{plot_name}**\n"
383
+ response += f" 📁 `{rel_path}`\n\n"
384
+ except Exception as e:
385
+ response += f"{i}. ❌ Error: {str(e)}\n"
386
+
387
+ if html_reports:
388
+ response += f"### 📋 Reports & Interactive Plots ({len(html_reports)} files)\n"
389
+ response += "✅ Reports are displayed in the **Reports Viewer** below!\n\n"
390
+
391
+ # List report files
392
+ for i, report_path in enumerate(html_reports[:10], 1):
393
+ try:
394
+ report_name = Path(report_path).stem.replace('_', ' ').title()
395
+ rel_path = os.path.relpath(report_path, '.')
396
+ file_size = os.path.getsize(report_path) / 1024 # KB
397
+ response += f"{i}. 📄 **{report_name}**\n"
398
+ response += f" 📁 `{rel_path}` ({file_size:.1f} KB)\n\n"
399
+ except Exception as e:
400
+ response += f"{i}. ❌ Error: {str(e)}\n"
401
+ else:
402
+ response += "ℹ️ No visualizations or reports were generated in this workflow.\n"
403
+ else:
404
+ response = f"⚠️ **AI Agent Status:** {agent_response.get('status', 'unknown')}\n\n"
405
+ response += f"{agent_response.get('message', agent_response.get('error', 'Unknown error'))}\n"
406
+
407
+ # Update the last assistant message with the response
408
+ history = update_last_assistant_message(history, response)
409
+
410
+ # Return plot paths for gallery and html_reports for HTML viewer
411
+ # Store html_reports in a format the HTML component can use
412
+ yield history, "", plots_paths if plots_paths else [], html_reports if html_reports else []
413
+ return
414
+ except Exception as e:
415
+ import sys
416
+ exc_type, exc_value, exc_traceback = sys.exc_info()
417
+ response = f"⚠️ **AI Agent Error:**\n\n"
418
+ response += f"**Error Type:** {exc_type.__name__}\n\n"
419
+ response += f"**Error Message:** {str(e)}\n\n"
420
+ response += f"**Full Traceback:**\n```python\n{traceback.format_exc()}\n```\n\n"
421
+ response += "💡 **Fallback Options:**\n"
422
+ response += "- Use the **Quick Train** feature on the right\n"
423
+ response += "- Try manual commands: `profile`, `quality`, `columns`\n"
424
+ # Update the last assistant message with error
425
+ history = update_last_assistant_message(history, response)
426
+ yield history, "", plots_paths if plots_paths else [], []
427
+ return
428
+ else:
429
+ # Manual mode - Handle commands directly
430
+ user_msg_lower = user_message.lower().strip()
431
+
432
+ # Handle simple commands manually
433
+ if 'profile' in user_msg_lower:
434
+ response = "📊 **Dataset Profile:**\n\n"
435
+ if current_profile:
436
+ response += f"**Shape:** {current_profile['shape']['rows']:,} rows × {current_profile['shape']['columns']} columns\n\n"
437
+ response += f"**Column Types:**\n"
438
+ response += f"- Numeric: {len(current_profile['column_types']['numeric'])} columns\n"
439
+ response += f"- Categorical: {len(current_profile['column_types']['categorical'])} columns\n"
440
+ response += f"- Datetime: {len(current_profile['column_types']['datetime'])} columns\n\n"
441
+ response += f"**Overall Stats:**\n"
442
+ response += f"- Total cells: {current_profile['overall_stats']['total_cells']:,}\n"
443
+ response += f"- Null values: {current_profile['overall_stats']['total_nulls']} ({current_profile['overall_stats']['null_percentage']:.1f}%)\n"
444
+ response += f"- Duplicates: {current_profile['overall_stats']['duplicate_rows']}\n"
445
+ else:
446
+ response += "Profile information is available at the top of the chat!"
447
+
448
+ elif 'quality' in user_msg_lower or 'issues' in user_msg_lower:
449
+ quality = detect_data_quality_issues(current_file)
450
+ response = "🔍 **Data Quality Report:**\n\n"
451
+
452
+ if quality['critical']:
453
+ response += f"🔴 **Critical Issues:** {len(quality['critical'])}\n"
454
+ for issue in quality['critical']:
455
+ response += f" • {issue['message']}\n"
456
+ response += "\n"
457
+
458
+ if quality['warning']:
459
+ response += f"🟡 **Warnings:** {len(quality['warning'])}\n"
460
+ for issue in quality['warning'][:5]: # Show first 5
461
+ response += f" • {issue['message']}\n"
462
+ if len(quality['warning']) > 5:
463
+ response += f" • ... and {len(quality['warning']) - 5} more\n"
464
+ response += "\n"
465
+
466
+ if quality['info']:
467
+ response += f"🔵 **Info:** {len(quality['info'])} observations\n"
468
+
469
+ if not quality['critical'] and not quality['warning'] and not quality['info']:
470
+ response += "✅ No issues detected! Your data looks good.\n"
471
+
472
+ elif 'columns' in user_msg_lower or 'column' in user_msg_lower:
473
+ if current_profile:
474
+ response = "📋 **Dataset Columns:**\n\n"
475
+ for col, info in current_profile['columns'].items():
476
+ nulls = info.get('null_count', 0)
477
+ null_pct = (nulls / current_profile['shape']['rows'] * 100) if current_profile['shape']['rows'] > 0 else 0
478
+ response += f"• **{col}** ({info['type']})\n"
479
+ response += f" - Nulls: {nulls} ({null_pct:.1f}%)\n"
480
+ if 'unique' in info:
481
+ response += f" - Unique: {info['unique']}\n"
482
+ else:
483
+ response = "📋 **Columns:** Please upload a file first to see column information."
484
+
485
+ elif 'help' in user_msg_lower:
486
+ response = "💡 **Available Commands:**\n\n"
487
+ response += "**Manual Commands:**\n"
488
+ response += "• `profile` - Show detailed dataset statistics\n"
489
+ response += "• `quality` - Check data quality issues\n"
490
+ response += "• `columns` - List all columns with details\n"
491
+ response += "• `help` - Show this help message\n\n"
492
+ response += "**Quick Actions:**\n"
493
+ response += "• Use the **Quick Train** panel on the right to train models\n"
494
+ response += "• Check **Dataset Info** in the sidebar for quick stats\n"
495
+
496
+ else:
497
+ # Default response for unrecognized commands
498
+ response = f"💬 **You said:** {user_message}\n\n"
499
+ response += "⚠️ AI agent is not available. I can respond to these commands:\n\n"
500
+ response += "• `profile` - Show detailed statistics\n"
501
+ response += "• `quality` - Check data quality\n"
502
+ response += "• `columns` - List all columns\n"
503
+ response += "• `help` - Show available commands\n\n"
504
+ response += "**Or use Quick Train** on the right to train models directly!\n"
505
+
506
+ # Add user message and assistant response
507
+ history = add_user_message(history, user_message)
508
+ history = add_assistant_message(history, response)
509
+ yield history, "", [], []
510
+ return
511
+
512
+ # If no file is uploaded yet
513
+ if user_message and user_message.strip() and not current_file:
514
+ response = "⚠️ **Please upload a dataset first!**\n\n"
515
+ response += "Click the 'Upload Dataset' button above and select a CSV or Parquet file."
516
+ # Add user message and assistant response
517
+ history = add_user_message(history, user_message)
518
+ history = add_assistant_message(history, response)
519
+ yield history, "", [], []
520
+ return
521
+
522
+ except Exception as e:
523
+ error_msg = f"❌ **Error:** {str(e)}\n\n"
524
+ error_msg += "**Traceback:**\n```\n" + traceback.format_exc() + "\n```"
525
+ if user_message:
526
+ # Check if we already added the user message
527
+ last_user = get_last_user_content(history)
528
+ if last_user != user_message:
529
+ history = add_user_message(history, user_message)
530
+ history = add_assistant_message(history, error_msg)
531
+ else:
532
+ history = add_assistant_message(history, error_msg)
533
+ yield history, "", [], []
534
+ return
535
+
536
+ # Default return if nothing matched
537
+ yield history, "", [], []
538
+
539
+
540
+ def quick_profile(file):
541
+ """Quick profile display in the sidebar."""
542
+ if file is None:
543
+ return "No file uploaded yet."
544
+
545
+ try:
546
+ profile = profile_dataset(file.name)
547
+
548
+ info = f"**{Path(file.name).name}**\n\n"
549
+ info += f"📊 {profile['shape']['rows']:,} rows × {profile['shape']['columns']} cols\n\n"
550
+ info += f"**Columns:**\n"
551
+ for col, col_info in list(profile['columns'].items())[:10]:
552
+ info += f"- {col} ({col_info['type']})\n"
553
+
554
+ if len(profile['columns']) > 10:
555
+ info += f"- ... and {len(profile['columns']) - 10} more\n"
556
+
557
+ return info
558
+ except Exception as e:
559
+ return f"Error: {str(e)}"
560
+
561
+
562
+ def train_model_ui(file, target_col, model_type, test_size, progress=gr.Progress()):
563
+ """Train a model directly from the UI."""
564
+ if file is None:
565
+ return "⚠️ Please upload a dataset first!"
566
+
567
+ if not target_col:
568
+ return "⚠️ Please specify a target column!"
569
+
570
+ # Clean up the target column name - remove surrounding quotes if present
571
+ target_col = target_col.strip().strip("'").strip('"')
572
+
573
+ try:
574
+ # Show progress
575
+ progress(0, desc="🔄 Loading dataset...")
576
+ yield "⏳ **Training in progress...**\n\n📊 Loading dataset..."
577
+
578
+ import time
579
+ time.sleep(0.5) # Brief pause for UI feedback
580
+
581
+ progress(0.2, desc="🔄 Preparing data...")
582
+ yield "⏳ **Training in progress...**\n\n📊 Dataset loaded\n🔄 Preparing data..."
583
+
584
+ time.sleep(0.3)
585
+ # Determine problem type
586
+ problem_type = "classification" if model_type == "Classification" else "regression"
587
+
588
+ progress(0.4, desc="🤖 Training models...")
589
+ yield "⏳ **Training in progress...**\n\n📊 Dataset loaded\n✅ Data prepared\n🤖 Training multiple models..."
590
+
591
+ # Train baseline models
592
+ result = train_baseline_models(
593
+ file.name,
594
+ target_col=target_col,
595
+ task_type=problem_type,
596
+ test_size=test_size
597
+ )
598
+
599
+ progress(0.9, desc="📊 Evaluating results...")
600
+
601
+ # Check if training was successful
602
+ if result.get('status') == 'error':
603
+ yield f"❌ **Training Failed**\n\n{result.get('message', 'Unknown error')}"
604
+ return
605
+
606
+ if 'best_model' not in result:
607
+ yield f"❌ **Training Failed**\n\nNo models were successfully trained. Result: {result}"
608
+ return
609
+
610
+ # Get the best model
611
+ best_model_name = result['best_model']['name']
612
+ if not best_model_name:
613
+ yield f"❌ **Training Failed**\n\nNo model could be selected as best model."
614
+ return
615
+
616
+ best_model_info = result['models'][best_model_name]
617
+ best_metrics = best_model_info.get('test_metrics', {})
618
+
619
+ output = f"✅ **Model Training Complete!**\n\n"
620
+ output += f"## 🏆 Best Model: **{best_model_name}**\n\n"
621
+
622
+ output += f"**Dataset Info:**\n"
623
+ output += f"- Features: {result.get('n_features', 0)}\n"
624
+ output += f"- Training samples: {result.get('train_size', 0):,}\n"
625
+ output += f"- Test samples: {result.get('test_size', 0):,}\n\n"
626
+
627
+ if problem_type == "classification":
628
+ output += f"**Test Metrics:**\n"
629
+ output += f"- ✅ Accuracy: {best_metrics.get('accuracy', 0):.4f}\n"
630
+ output += f"- 🎯 Precision: {best_metrics.get('precision', 0):.4f}\n"
631
+ output += f"- 📊 Recall: {best_metrics.get('recall', 0):.4f}\n"
632
+ output += f"- 🔥 F1 Score: {best_metrics.get('f1', 0):.4f}\n\n"
633
+ else:
634
+ output += f"**Test Metrics:**\n"
635
+ output += f"- 📈 R² Score: {best_metrics.get('r2', 0):.4f}\n"
636
+ output += f"- 📉 RMSE: {best_metrics.get('rmse', 0):.2f}\n"
637
+ output += f"- 📊 MAE: {best_metrics.get('mae', 0):.2f}\n"
638
+ output += f"- 💯 MAPE: {best_metrics.get('mape', 0):.2f}%\n\n"
639
+
640
+ output += f"## 📊 All Models Comparison:\n\n"
641
+ for model_name, model_info in result['models'].items():
642
+ if 'test_metrics' in model_info:
643
+ test_metrics = model_info['test_metrics']
644
+ indicator = "🏆 " if model_name == best_model_name else " "
645
+ if problem_type == "classification":
646
+ f1 = test_metrics.get('f1', 0)
647
+ acc = test_metrics.get('accuracy', 0)
648
+ output += f"{indicator}**{model_name}:**\n"
649
+ output += f" - F1: {f1:.4f} | Accuracy: {acc:.4f}\n"
650
+ else:
651
+ r2 = test_metrics.get('r2', 0)
652
+ rmse = test_metrics.get('rmse', 0)
653
+ output += f"{indicator}**{model_name}:**\n"
654
+ output += f" - R²: {r2:.4f} | RMSE: {rmse:.2f}\n"
655
+ elif 'status' in model_info and model_info['status'] == 'error':
656
+ output += f" ❌ **{model_name}:** {model_info.get('message', 'Error')}\n"
657
+
658
+ # Display generated plots if available
659
+ plots_to_show = []
660
+
661
+ # Check for performance plots
662
+ if 'performance_plots' in result and result['performance_plots']:
663
+ if isinstance(result['performance_plots'], list):
664
+ plots_to_show.extend(result['performance_plots'])
665
+ else:
666
+ plots_to_show.append(result['performance_plots'])
667
+
668
+ # Check for feature importance plot
669
+ if 'feature_importance_plot' in result and result['feature_importance_plot']:
670
+ plots_to_show.append(result['feature_importance_plot'])
671
+
672
+ # Embed plots
673
+ if plots_to_show:
674
+ output += f"\n\n📊 **Visualizations:**\n\n"
675
+ for plot_path in plots_to_show:
676
+ if isinstance(plot_path, str) and plot_path.endswith('.html') and os.path.exists(plot_path):
677
+ try:
678
+ with open(plot_path, 'r', encoding='utf-8') as f:
679
+ plot_html = f.read()
680
+ # Add plot title based on filename
681
+ plot_name = Path(plot_path).stem.replace('_', ' ').title()
682
+ output += f"**{plot_name}:**\n"
683
+ output += f'<iframe srcdoc="{plot_html.replace("&", "&amp;").replace(chr(34), "&quot;")}" width="100%" height="500" frameborder="0"></iframe>\n\n'
684
+ except Exception as e:
685
+ # Fallback to file path
686
+ output += f"📁 {Path(plot_path).name}: `{plot_path}`\n"
687
+
688
+ progress(1.0, desc="✅ Complete!")
689
+ yield output
690
+
691
+ except Exception as e:
692
+ yield f"❌ **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
693
+
694
+
695
+ def clear_conversation():
696
+ """Clear the conversation and reset state."""
697
+ global current_file, current_profile, last_agent_response
698
+ current_file = None
699
+ current_profile = None
+ last_agent_response = None
700
+ return [], None, "", [], ""
701
+
702
+
703
+ def format_html_reports(html_paths):
704
+ """Format HTML reports/plots for display in HTML component."""
705
+ if not html_paths or len(html_paths) == 0:
706
+ return "<div style='text-align:center; padding:40px; color:#666;'>No reports generated yet. Try: 'Generate a quality report' or 'Create interactive visualizations'</div>"
707
+
708
+ html_output = """
709
+ <style>
710
+ .report-container {
711
+ padding: 20px;
712
+ background: #f8f9fa;
713
+ }
714
+ .report-card {
715
+ margin-bottom: 30px;
716
+ border: 2px solid #dee2e6;
717
+ border-radius: 12px;
718
+ overflow: hidden;
719
+ background: white;
720
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1);
721
+ }
722
+ .report-header {
723
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
724
+ color: white;
725
+ padding: 15px 20px;
726
+ font-weight: bold;
727
+ font-size: 18px;
728
+ display: flex;
729
+ justify-content: space-between;
730
+ align-items: center;
731
+ }
732
+ .report-meta {
733
+ font-size: 12px;
734
+ opacity: 0.9;
735
+ }
736
+ .report-iframe {
737
+ width: 100%;
738
+ min-height: 600px;
739
+ border: none;
740
+ background: white;
741
+ }
742
+ .report-footer {
743
+ background: #f8f9fa;
744
+ padding: 10px 20px;
745
+ font-size: 12px;
746
+ color: #666;
747
+ border-top: 1px solid #dee2e6;
748
+ }
749
+ </style>
750
+ <div class="report-container">
751
+ """
752
+
753
+ html_output += f"<h2 style='color: #667eea; margin-bottom: 20px;'>📋 {len(html_paths)} Report(s) Generated</h2>"
754
+
755
+ for i, html_path in enumerate(html_paths, 1):
756
+ try:
757
+ # Get file metadata
758
+ file_name = Path(html_path).name
759
+ file_size = os.path.getsize(html_path) / 1024 # KB
760
+ report_title = Path(html_path).stem.replace('_', ' ').title()
761
+
762
+ # Read the HTML content
763
+ with open(html_path, 'r', encoding='utf-8') as f:
764
+ html_content = f.read()
765
+
766
+ # Escape the content for embedding in the srcdoc attribute: ampersands
+ # first, then double quotes. (Backslashes and single quotes must be left
+ # untouched, or scripts inside the embedded report get corrupted.)
+ escaped_content = html_content.replace('&', '&amp;').replace('"', '&quot;')
768
+
769
+ html_output += f"""
770
+ <div class="report-card">
771
+ <div class="report-header">
772
+ <span>📊 {i}. {report_title}</span>
773
+ <span class="report-meta">{file_size:.1f} KB</span>
774
+ </div>
775
+ <iframe class="report-iframe" srcdoc="{escaped_content}"></iframe>
776
+ <div class="report-footer">
777
+ 📁 {html_path}
778
+ </div>
779
+ </div>
780
+ """
781
+ except Exception as e:
782
+ html_output += f"""
783
+ <div class="report-card">
784
+ <div class="report-header" style="background: linear-gradient(135deg, #f44336 0%, #e91e63 100%);">
785
+ <span>❌ Error loading: {Path(html_path).name}</span>
786
+ </div>
787
+ <div style="padding: 20px;">
788
+ <p><strong>Error:</strong> {str(e)}</p>
789
+ <p><strong>Path:</strong> {html_path}</p>
790
+ </div>
791
+ </div>
792
+ """
793
+
794
+ html_output += "</div>"
795
+
796
+ return html_output
797
+
798
+
799
+ def extract_and_display_plots(agent_response):
800
+ """Extract plots from agent response and format them for display."""
801
+ plots_html = ""
802
+
803
+ if not agent_response or agent_response.get('status') != 'success':
804
+ return gr.update(value="<p style='text-align:center; color:#666;'>No visualizations generated yet. Upload a dataset and run analysis!</p>")
805
+
806
+ workflow_history = agent_response.get('workflow_history', [])
807
+ if not workflow_history:
808
+ return gr.update(value="<p style='text-align:center; color:#666;'>No visualizations in this workflow.</p>")
809
+
810
+ # Find all plots
811
+ plots_paths = []
812
+
813
+ def find_plots(obj, plots_list):
814
+ if isinstance(obj, dict):
815
+ # Check direct plot keys
816
+ for key in ['plot_path', 'plot_file', 'html_path', 'output_path',
817
+ 'plots', 'plot_paths', 'performance_plots', 'feature_importance_plot']:
818
+ if key in obj and obj[key]:
819
+ if isinstance(obj[key], list):
820
+ for plot_path in obj[key]:
821
+ if isinstance(plot_path, str) and plot_path.endswith('.html') and os.path.exists(plot_path):
822
+ plots_list.append(plot_path)
823
+ elif isinstance(obj[key], str) and obj[key].endswith('.html') and os.path.exists(obj[key]):
824
+ plots_list.append(obj[key])
825
+ # Recursively search nested dicts
826
+ for value in obj.values():
827
+ find_plots(value, plots_list)
828
+
829
+ for step in workflow_history:
830
+ result = step.get('result', {})
831
+ find_plots(result, plots_paths)
832
+
833
+ # Remove duplicates while preserving order
834
+ plots_paths = list(dict.fromkeys(plots_paths))
835
+
836
+ if not plots_paths:
837
+ return gr.update(value="<p style='text-align:center; color:#666;'>No plots were generated in this analysis.</p>")
838
+
839
+ # Build HTML gallery
840
+ plots_html = f"""
841
+ <div style='padding: 20px;'>
842
+ <h2 style='color: #1f77b4; margin-bottom: 20px;'>📊 Visualization Gallery ({len(plots_paths)} plots)</h2>
843
+ """
844
+
845
+ for i, plot_path in enumerate(plots_paths, 1):
846
+ try:
847
+ with open(plot_path, 'r', encoding='utf-8') as f:
848
+ plot_content = f.read()
849
+
850
+ plot_name = Path(plot_path).stem.replace('_', ' ').title()
851
+
852
+ plots_html += f"""
853
+ <div style='margin-bottom: 30px; border: 1px solid #ddd; border-radius: 8px; overflow: hidden;'>
854
+ <div style='background: linear-gradient(90deg, #1f77b4, #2ca02c); color: white; padding: 10px 15px; font-weight: bold;'>
855
+ {i}. {plot_name}
856
+ </div>
857
+ <div style='padding: 10px; background: white;'>
858
+ <iframe srcdoc='{plot_content.replace("'", "&apos;").replace('"', "&quot;")}'
859
+ width='100%' height='500' frameborder='0'
860
+ style='border: none; border-radius: 5px;'></iframe>
861
+ </div>
862
+ <div style='background: #f8f9fa; padding: 8px 15px; font-size: 12px; color: #666;'>
863
+ 📁 {plot_path}
864
+ </div>
865
+ </div>
866
+ """
867
+ except Exception as e:
868
+ plots_html += f"""
869
+ <div style='margin-bottom: 20px; padding: 15px; border: 1px solid #f44336; border-radius: 5px; background: #ffebee;'>
870
+ <strong>❌ Failed to load: {Path(plot_path).name}</strong><br>
871
+ <small>{str(e)}</small>
872
+ </div>
873
+ """
874
+
875
+ plots_html += "</div>"
876
+
877
+ return gr.update(value=plots_html)
878
+
879
+
880
+ # Custom CSS for better visual feedback
881
+ custom_css = """
882
+ .status-box {
883
+ padding: 10px;
884
+ border-radius: 5px;
885
+ background: linear-gradient(90deg, #e8f5e9 0%, #c8e6c9 100%);
886
+ margin-bottom: 10px;
887
+ text-align: center;
888
+ font-weight: bold;
889
+ }
890
+ """
891
+
892
+ # Create the Gradio interface
893
+ with gr.Blocks(title="AI Agent Data Scientist", theme=gr.themes.Soft(), css=custom_css) as demo:
894
+ gr.Markdown("""
895
+ # 🤖 AI Agent Data Scientist
896
+
897
+ Upload your dataset and chat with the AI agent to perform data science tasks!
898
+
899
+ **Features:**
900
+ - 📊 Automatic dataset profiling
901
+ - 🤖 Natural language queries
902
+ - 🎯 Model training (classification & regression)
903
+ - 🔍 Data quality analysis
904
+ - 📈 Feature engineering
905
+ - 🎨 **NEW:** Automatic visualization generation!
906
+ - And 59 tools total!
907
+ """)
908
+
909
+ # Store agent response for visualization extraction
910
+ agent_response_state = gr.State(None)
911
+
912
+ with gr.Row():
913
+ # Left column - Main chat interface
914
+ with gr.Column(scale=2):
915
+ # Status indicator
916
+ status_box = gr.Markdown("🟢 **Ready** - Upload a dataset to begin", elem_classes=["status-box"])
917
+
918
+ chatbot = gr.Chatbot(
919
+ label="Chat with AI Agent",
920
+ height=450,
921
+ show_label=True,
922
+ avatar_images=(None, "🤖"),
923
+ sanitize_html=False # Allow HTML content including iframes
924
+ )
925
+
926
+ with gr.Row():
927
+ file_upload = gr.File(
928
+ label="📁 Upload Dataset(s) (CSV/Parquet) - Single or Multiple Files",
929
+ file_types=[".csv", ".parquet"],
930
+ file_count="multiple", # Allow multiple file uploads
931
+ type="filepath"
932
+ )
933
+
934
+ with gr.Row():
935
+ user_input = gr.Textbox(
936
+ label="Your Message",
937
+ placeholder="Ask anything: 'train a model', 'analyze my data', 'generate visualizations'",
938
+ lines=2,
939
+ scale=4
940
+ )
941
+ submit_btn = gr.Button("📤 Send", variant="primary", scale=1)
942
+
943
+ with gr.Row():
944
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+ # Right column - Quick actions and info
945
+ with gr.Column(scale=1):
946
+ gr.Markdown("## 📊 Dataset Info")
947
+ dataset_info = gr.Markdown("Upload a dataset to see information here.")
948
+
949
+ gr.Markdown("## 🎯 Quick Train")
950
+ with gr.Group():
951
+ target_column = gr.Textbox(
952
+ label="Target Column",
953
+ placeholder="e.g., 'price', 'class', 'label'"
954
+ )
955
+ model_type_choice = gr.Radio(
956
+ ["Classification", "Regression"],
957
+ label="Model Type",
958
+ value="Classification"
959
+ )
960
+ test_size_slider = gr.Slider(
961
+ 0.1, 0.5, 0.3,
962
+ label="Test Size",
963
+ step=0.05
964
+ )
965
+ train_btn = gr.Button("🚀 Train Model", variant="primary")
966
+
967
+ training_output = gr.Markdown("Training results will appear here.")
968
+
969
+ gr.Markdown("""
970
+ ## 💡 Example Queries
971
+
972
+ - "Train a classification model to predict [target]"
973
+ - "Show me statistics for [column]"
974
+ - "Detect outliers in the dataset"
975
+ - "What are the most important features?"
976
+ - "Generate a quality report"
977
+ - "Create polynomial features"
978
+ - "Balance the dataset using SMOTE"
979
+ """)
980
+
981
+ # Visualization Gallery Section (Full Width)
982
+ with gr.Row():
983
+ with gr.Column():
984
+ gr.Markdown("## 🎨 Visualization Gallery")
985
+ visualization_gallery = gr.Gallery(
986
+ label="Generated Plots (PNG/JPG)",
987
+ show_label=True,
988
+ elem_id="gallery",
989
+ columns=2,
990
+ height=400
991
+ )
992
+
993
+ # Reports Viewer Section (Full Width)
994
+ with gr.Row():
995
+ with gr.Column():
996
+ gr.Markdown("## 📋 Reports & Interactive Visualizations")
997
+ gr.Markdown("*HTML reports and interactive Plotly charts will be displayed here*")
998
+ reports_viewer = gr.HTML(
999
+ value="<div style='text-align:center; padding:40px; color:#666;'>No reports generated yet. Try: 'Generate a quality report' or 'Create interactive visualizations'</div>",
1000
+ elem_id="reports_viewer"
1001
+ )
1002
+
1003
+ # Create state to hold HTML report paths
1004
+ html_reports_state = gr.State([])
1005
+
1006
+ # Event handlers with streaming support
1007
+ submit_result = submit_btn.click(
1008
+ fn=analyze_dataset,
1009
+ inputs=[file_upload, user_input, chatbot],
1010
+ outputs=[chatbot, user_input, visualization_gallery, html_reports_state],
1011
+ show_progress="full" # Show progress bar
1012
+ )
1013
+ submit_result.then(
1014
+ fn=format_html_reports,
1015
+ inputs=[html_reports_state],
1016
+ outputs=[reports_viewer]
1017
+ )
1018
+
1019
+ user_input_result = user_input.submit(
1020
+ fn=analyze_dataset,
1021
+ inputs=[file_upload, user_input, chatbot],
1022
+ outputs=[chatbot, user_input, visualization_gallery, html_reports_state],
1023
+ show_progress="full"
1024
+ )
1025
+ user_input_result.then(
1026
+ fn=format_html_reports,
1027
+ inputs=[html_reports_state],
1028
+ outputs=[reports_viewer]
1029
+ )
1030
+
1031
+ file_result = file_upload.change(
1032
+ fn=analyze_dataset,
1033
+ inputs=[file_upload, gr.Textbox(value="", visible=False), chatbot],
1034
+ outputs=[chatbot, user_input, visualization_gallery, html_reports_state],
1035
+ show_progress="full"
1036
+ )
1037
+ file_result.then(
1038
+ fn=quick_profile,
1039
+ inputs=[file_upload],
1040
+ outputs=[dataset_info]
1041
+ )
1042
+ file_result.then(
1043
+ fn=format_html_reports,
1044
+ inputs=[html_reports_state],
1045
+ outputs=[reports_viewer]
1046
+ )
1047
+
1048
+ train_btn.click(
1049
+ fn=train_model_ui,
1050
+ inputs=[file_upload, target_column, model_type_choice, test_size_slider],
1051
+ outputs=[training_output],
1052
+ show_progress="full" # Show progress bar
1053
+ )
1054
+
1055
+ clear_btn.click(
1056
+ clear_conversation,
1057
+ outputs=[chatbot, file_upload, user_input, visualization_gallery, reports_viewer]
1058
+ )
1059
+
1060
+ if __name__ == "__main__":
1061
+ print("=" * 70)
1062
+ print("🚀 Starting AI Agent Data Scientist Chat UI...")
1063
+ print("=" * 70)
1064
+ print("\n🌐 The UI will open in your browser automatically.")
1065
+ print("💡 If it doesn't, copy the URL shown below.\n")
1066
+
1067
+ demo.launch(
1068
+ share=False, # Set to True to create a public link
1069
+ server_name="0.0.0.0", # Listen on all interfaces
1070
+ server_port=7865, # Changed port to avoid conflict
1071
+ show_error=True,
1072
+ inbrowser=True # Auto-open browser
1073
+ )
cloudbuild.yaml ADDED
@@ -0,0 +1,69 @@
1
+ # Google Cloud Build configuration for automated deployments
2
+ # Triggered on git push to main branch
3
+
4
+ steps:
5
+ # Step 1: Build the container image
6
+ - name: 'gcr.io/cloud-builders/docker'
7
+ args:
8
+ - 'build'
9
+ - '-t'
10
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
11
+ - '-t'
12
+ - 'gcr.io/$PROJECT_ID/data-science-agent:latest'
13
+ - '.'
14
+ timeout: 600s
15
+
16
+ # Step 2: Push the container image to Container Registry
17
+ - name: 'gcr.io/cloud-builders/docker'
18
+ args:
19
+ - 'push'
20
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
21
+
22
+ - name: 'gcr.io/cloud-builders/docker'
23
+ args:
24
+ - 'push'
25
+ - 'gcr.io/$PROJECT_ID/data-science-agent:latest'
26
+
27
+ # Step 3: Deploy to Cloud Run
28
+ - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
29
+ entrypoint: gcloud
30
+ args:
31
+ - 'run'
32
+ - 'deploy'
33
+ - 'data-science-agent'
34
+ - '--image'
35
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
36
+ - '--region'
37
+ - 'us-central1'
38
+ - '--platform'
39
+ - 'managed'
40
+ - '--allow-unauthenticated'
41
+ - '--memory'
42
+ - '4Gi'
43
+ - '--cpu'
44
+ - '2'
45
+ - '--timeout'
46
+ - '900'
47
+ - '--max-instances'
48
+ - '10'
49
+ - '--min-instances'
50
+ - '0'
51
+ - '--concurrency'
52
+ - '10'
53
+ - '--set-env-vars'
54
+ - 'LLM_PROVIDER=groq,REASONING_EFFORT=medium,CACHE_TTL_SECONDS=86400'
55
+ - '--set-secrets'
56
+ - 'GROQ_API_KEY=GROQ_API_KEY:latest,GOOGLE_API_KEY=GOOGLE_API_KEY:latest,GOOGLE_APPLICATION_CREDENTIALS=GOOGLE_APPLICATION_CREDENTIALS:latest'
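+ # NOTE: GOOGLE_APPLICATION_CREDENTIALS must be a *file path*. If the secret
+ # holds the service-account JSON itself, mount it as a volume instead, e.g.
+ # '/secrets/sa/key.json=GOOGLE_APPLICATION_CREDENTIALS:latest', and point
+ # the env var at that path.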
57
+
58
+ # Build timeout
59
+ timeout: 1200s
60
+
61
+ # Images to push to Container Registry
62
+ images:
63
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
64
+ - 'gcr.io/$PROJECT_ID/data-science-agent:latest'
65
+
66
+ # Build options
67
+ options:
68
+ machineType: 'N1_HIGHCPU_8'
69
+ logging: CLOUD_LOGGING_ONLY
data/.gitkeep ADDED
File without changes
deploy.sh ADDED
@@ -0,0 +1,171 @@
1
+ #!/bin/bash
2
+ # Manual deployment script for Google Cloud Run
3
+ # Use this for one-off deployments or CI/CD pipeline integration
4
+
5
+ set -e # Exit on error
6
+
7
+ # Colors for output
8
+ RED='\033[0;31m'
9
+ GREEN='\033[0;32m'
10
+ YELLOW='\033[1;33m'
11
+ NC='\033[0m' # No Color
12
+
13
+ echo -e "${GREEN}🚀 Data Science Agent - Cloud Run Deployment${NC}"
14
+ echo "=================================================="
15
+
16
+ # Check if gcloud is installed
17
+ if ! command -v gcloud &> /dev/null; then
18
+ echo -e "${RED}❌ Error: gcloud CLI not found. Install it from: https://cloud.google.com/sdk/install${NC}"
19
+ exit 1
20
+ fi
21
+
22
+ # Get GCP Project ID
23
+ if [ -z "$GCP_PROJECT_ID" ]; then
24
+ echo -e "${YELLOW}⚠️ GCP_PROJECT_ID not set. Using gcloud default project...${NC}"
25
+ GCP_PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
26
+
27
+ if [ -z "$GCP_PROJECT_ID" ]; then
28
+ echo -e "${RED}❌ Error: No GCP project configured. Run: gcloud config set project YOUR_PROJECT_ID${NC}"
29
+ exit 1
30
+ fi
31
+ fi
32
+
33
+ echo -e "${GREEN}📋 Project ID: ${GCP_PROJECT_ID}${NC}"
34
+
35
+ # Configuration
36
+ SERVICE_NAME="data-science-agent"
37
+ REGION="${CLOUD_RUN_REGION:-us-central1}"
38
+ IMAGE_NAME="gcr.io/${GCP_PROJECT_ID}/${SERVICE_NAME}"
39
+ MEMORY="${MEMORY:-4Gi}"
40
+ CPU="${CPU:-2}"
41
+ MAX_INSTANCES="${MAX_INSTANCES:-10}"
42
+ TIMEOUT="${TIMEOUT:-900}"
43
+
44
+ echo "Region: ${REGION}"
45
+ echo "Image: ${IMAGE_NAME}:latest"
46
+ echo "Memory: ${MEMORY}"
47
+ echo "CPU: ${CPU}"
48
+ echo ""
49
+
50
+ # Step 1: Enable required APIs
51
+ echo -e "${YELLOW}🔧 Step 1/5: Enabling required Google Cloud APIs...${NC}"
52
+ gcloud services enable \
53
+ cloudbuild.googleapis.com \
54
+ run.googleapis.com \
55
+ containerregistry.googleapis.com \
56
+ secretmanager.googleapis.com \
57
+ --project=${GCP_PROJECT_ID} \
58
+ --quiet
59
+
60
+ echo -e "${GREEN}✅ APIs enabled${NC}"
61
+ echo ""
62
+
63
+ # Step 2: Create secrets (if not exist)
64
+ echo -e "${YELLOW}🔐 Step 2/5: Checking secrets...${NC}"
65
+
66
+ create_secret_if_not_exists() {
67
+ local secret_name=$1
68
+ local secret_value=$2
69
+
70
+ if gcloud secrets describe ${secret_name} --project=${GCP_PROJECT_ID} &>/dev/null; then
71
+ echo " ℹ️ Secret ${secret_name} already exists"
72
+ else
73
+ if [ -n "${secret_value}" ]; then
74
+ echo " ➕ Creating secret: ${secret_name}"
75
+ echo -n "${secret_value}" | gcloud secrets create ${secret_name} \
76
+ --data-file=- \
77
+ --project=${GCP_PROJECT_ID} \
78
+ --quiet
79
+ else
80
+ echo -e " ${YELLOW}⚠️ ${secret_name} not provided. You'll need to create it manually:${NC}"
81
+ echo " gcloud secrets create ${secret_name} --data-file=- --project=${GCP_PROJECT_ID}"
82
+ fi
83
+ fi
84
+ }
85
+
86
+ create_secret_if_not_exists "GROQ_API_KEY" "${GROQ_API_KEY}"
87
+ create_secret_if_not_exists "GOOGLE_API_KEY" "${GOOGLE_API_KEY}"
88
+
89
+ echo -e "${GREEN}✅ Secrets checked${NC}"
90
+ echo ""
91
+
92
+ # Step 3: Build container image
93
+ echo -e "${YELLOW}🏗️ Step 3/5: Building container image...${NC}"
94
+ gcloud builds submit \
95
+ --tag ${IMAGE_NAME}:latest \
96
+ --project=${GCP_PROJECT_ID} \
97
+ --timeout=600s \
98
+ .
99
+
100
+ echo -e "${GREEN}✅ Container built: ${IMAGE_NAME}:latest${NC}"
101
+ echo ""
102
+
103
+ # Step 4: Deploy to Cloud Run
104
+ echo -e "${YELLOW}🚀 Step 4/5: Deploying to Cloud Run...${NC}"
105
+
106
+ # Build the gcloud command
107
+ DEPLOY_CMD="gcloud run deploy ${SERVICE_NAME} \
108
+ --image ${IMAGE_NAME}:latest \
109
+ --platform managed \
110
+ --region ${REGION} \
111
+ --allow-unauthenticated \
112
+ --memory ${MEMORY} \
113
+ --cpu ${CPU} \
114
+ --timeout ${TIMEOUT} \
115
+ --max-instances ${MAX_INSTANCES} \
116
+ --min-instances 0 \
117
+ --concurrency 10 \
118
+ --set-env-vars LLM_PROVIDER=groq,REASONING_EFFORT=medium,CACHE_TTL_SECONDS=86400,ARTIFACT_BACKEND=local \
119
+ --project ${GCP_PROJECT_ID}"
120
+
121
+ # Add secrets if they exist
122
+ if gcloud secrets describe GROQ_API_KEY --project=${GCP_PROJECT_ID} &>/dev/null; then
123
+ DEPLOY_CMD="${DEPLOY_CMD} --set-secrets GROQ_API_KEY=GROQ_API_KEY:latest"
124
+ fi
125
+
126
+ if gcloud secrets describe GOOGLE_API_KEY --project=${GCP_PROJECT_ID} &>/dev/null; then
127
+ DEPLOY_CMD="${DEPLOY_CMD} --set-secrets GOOGLE_API_KEY=GOOGLE_API_KEY:latest"
128
+ fi
129
+
130
+ # Execute deployment
131
+ eval ${DEPLOY_CMD}
132
+
133
+ echo -e "${GREEN}✅ Deployment complete${NC}"
134
+ echo ""
135
+
136
+ # Step 5: Get service URL
137
+ echo -e "${YELLOW}🌐 Step 5/5: Retrieving service URL...${NC}"
138
+ SERVICE_URL=$(gcloud run services describe ${SERVICE_NAME} \
139
+ --region ${REGION} \
140
+ --project ${GCP_PROJECT_ID} \
141
+ --format 'value(status.url)')
142
+
143
+ echo ""
144
+ echo -e "${GREEN}========================================${NC}"
145
+ echo -e "${GREEN}✅ DEPLOYMENT SUCCESSFUL!${NC}"
146
+ echo -e "${GREEN}========================================${NC}"
147
+ echo ""
148
+ echo -e "🌐 Service URL: ${GREEN}${SERVICE_URL}${NC}"
149
+ echo ""
150
+ echo "📝 Test endpoints:"
151
+ echo " Health check:"
152
+ echo " curl ${SERVICE_URL}/health"
153
+ echo ""
154
+ echo " List tools:"
155
+ echo " curl ${SERVICE_URL}/tools"
156
+ echo ""
157
+ echo " Run analysis:"
158
+ echo " curl -X POST ${SERVICE_URL}/run \\"
159
+ echo " -F 'file=@data.csv' \\"
160
+ echo " -F 'task_description=Analyze this dataset and predict the target column'"
161
+ echo ""
162
+ echo -e "${YELLOW}📊 View logs:${NC}"
163
+ echo " gcloud run logs read ${SERVICE_NAME} --region ${REGION} --project ${GCP_PROJECT_ID} --limit 50"
164
+ echo ""
165
+ echo -e "${YELLOW}🔧 Manage service:${NC}"
166
+ echo " gcloud run services describe ${SERVICE_NAME} --region ${REGION} --project ${GCP_PROJECT_ID}"
167
+ echo ""
168
+
169
+ # Save service URL to file
170
+ echo "${SERVICE_URL}" > .cloud_run_url
171
+ echo -e "${GREEN}💾 Service URL saved to .cloud_run_url${NC}"
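
Since the script saves the service URL to `.cloud_run_url`, a quick post-deploy smoke test is easy to script. Below is a minimal, hypothetical sketch using only the Python standard library; the endpoint and expected response keys are taken from `src/api/app.py`, and the script itself is not part of this commit:

```python
# smoke_test.py - hypothetical post-deploy check, not part of this commit.
# Reads the URL that deploy.sh wrote to .cloud_run_url and calls /health.
import json
import urllib.request
from pathlib import Path

service_url = Path(".cloud_run_url").read_text().strip()

with urllib.request.urlopen(f"{service_url}/health", timeout=30) as resp:
    payload = json.load(resp)

# Expected keys per src/api/app.py: status, agent_ready, provider, tools_count
assert payload["status"] == "healthy", payload
print(f"Service healthy: provider={payload['provider']}, tools={payload['tools_count']}")
```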
examples/titanic_example.py ADDED
@@ -0,0 +1,166 @@
1
+ """
2
+ Titanic Example - Demonstrating the complete Data Science Copilot workflow
3
+ """
4
+
5
+ import sys
6
+ import os
7
+ from pathlib import Path
8
+
9
+ # Add src to path
10
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
11
+
12
+ from orchestrator import DataScienceCopilot
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+
16
+ console = Console()
17
+
18
+
19
+ def main():
20
+ """
21
+ Complete example using the Titanic dataset.
22
+
23
+ This demonstrates the full workflow:
24
+ 1. Dataset profiling
25
+ 2. Quality issue detection
26
+ 3. Data cleaning
27
+ 4. Feature engineering
28
+ 5. Model training
29
+ 6. Report generation
30
+ """
31
+
32
+ console.print(Panel.fit(
33
+ "🚢 Titanic Survival Prediction - Complete Workflow Example",
34
+ style="bold blue"
35
+ ))
36
+
37
+ # Setup
38
+ titanic_path = "./data/titanic.csv"
39
+
40
+ # Check if dataset exists
41
+ if not Path(titanic_path).exists():
42
+ console.print("\n[yellow]⚠ Titanic dataset not found at ./data/titanic.csv[/yellow]")
43
+ console.print("[yellow]Please download it from: https://www.kaggle.com/c/titanic/data[/yellow]")
44
+ console.print("[yellow]Or place your own CSV file in the data directory[/yellow]\n")
45
+
46
+ # Use a sample path instead
47
+ console.print("[blue]Using sample dataset path for demonstration...[/blue]\n")
48
+ titanic_path = "your_dataset.csv" # User should replace this
49
+
50
+ # Initialize copilot
51
+ console.print("\n[bold]Step 1: Initialize Data Science Copilot[/bold]")
52
+ try:
53
+ copilot = DataScienceCopilot(reasoning_effort="medium")
54
+ console.print("[green]✓ Copilot initialized successfully[/green]")
55
+ except Exception as e:
56
+ console.print(f"[red]✗ Error: {e}[/red]")
57
+ console.print("[yellow]Make sure to set GROQ_API_KEY in .env file[/yellow]")
58
+ return
59
+
60
+ # Define the task
61
+ task_description = """
62
+ Analyze the Titanic dataset and build a model to predict passenger survival.
63
+
64
+ Key objectives:
65
+ 1. Understand the data structure and identify quality issues
66
+ 2. Handle missing values appropriately
67
+ 3. Engineer relevant features from available data (e.g., family size, titles from names)
68
+ 4. Train and compare multiple baseline models
69
+ 5. Identify the most important features for prediction
70
+ 6. Provide recommendations for improvement
71
+
72
+ Target: Achieve competitive performance (aim for 50-70th percentile on Kaggle leaderboard)
73
+ """
74
+
75
+ target_column = "Survived"
76
+
77
+ console.print("\n[bold]Step 2: Run Complete Analysis Workflow[/bold]")
78
+ console.print(f"Dataset: {titanic_path}")
79
+ console.print(f"Target: {target_column}")
80
+ console.print(f"Task: Predict passenger survival\n")
81
+
82
+ # Run analysis
83
+ try:
84
+ result = copilot.analyze(
85
+ file_path=titanic_path,
86
+ task_description=task_description,
87
+ target_col=target_column,
88
+ use_cache=True,
89
+ max_iterations=15 # Allow more iterations for complex workflow
90
+ )
91
+
92
+ # Display results
93
+ if result["status"] == "success":
94
+ console.print("\n[green]✓ Analysis Complete![/green]\n")
95
+
96
+ # Display summary
97
+ console.print(Panel(
98
+ result["summary"],
99
+ title="📋 Final Analysis Summary",
100
+ border_style="green"
101
+ ))
102
+
103
+ # Display workflow steps
104
+ console.print("\n[bold]🔧 Workflow Steps Executed:[/bold]")
105
+ for i, step in enumerate(result["workflow_history"], 1):
106
+ tool = step["tool"]
107
+ success = step["result"].get("success", False)
108
+ icon = "✓" if success else "✗"
109
+ color = "green" if success else "red"
110
+ console.print(f"{i}. [{color}]{icon}[/{color}] {tool}")
111
+
112
+ # Display statistics
113
+ console.print(f"\n[bold]📊 Execution Statistics:[/bold]")
114
+ console.print(f" Total Iterations: {result['iterations']}")
115
+ console.print(f" API Calls Made: {result['api_calls']}")
116
+ console.print(f" Execution Time: {result['execution_time']}s")
117
+
118
+ # Check for trained models
119
+ console.print("\n[bold]🤖 Model Training Results:[/bold]")
120
+ for step in result["workflow_history"]:
121
+ if step["tool"] == "train_baseline_models":
122
+ if step["result"].get("success"):
123
+ models_result = step["result"]["result"]
124
+ best_model = models_result.get("best_model", {})
125
+ console.print(f" Best Model: {best_model.get('name')}")
126
+ console.print(f" Score: {best_model.get('score'):.4f}")
127
+ console.print(f" Model Path: {best_model.get('model_path')}")
128
+
129
+ # Save results
130
+ output_file = "./outputs/reports/titanic_analysis.json"
131
+ Path(output_file).parent.mkdir(parents=True, exist_ok=True)
132
+
133
+ import json
134
+ with open(output_file, "w") as f:
135
+ json.dump(result, f, indent=2, default=str)  # default=str guards figures and other non-JSON objects
136
+
137
+ console.print(f"\n[cyan]💾 Full results saved to: {output_file}[/cyan]")
138
+
139
+ # Next steps
140
+ console.print("\n[bold]🎯 Next Steps:[/bold]")
141
+ console.print(" 1. Review the generated models in ./outputs/models/")
142
+ console.print(" 2. Check data quality reports in ./outputs/reports/")
143
+ console.print(" 3. Examine cleaned datasets in ./outputs/data/")
144
+ console.print(" 4. Use the best model for predictions on new data")
145
+
146
+ elif result["status"] == "error":
147
+ console.print(f"\n[red]✗ Analysis failed: {result['error']}[/red]")
148
+ console.print(f"Error type: {result['error_type']}")
149
+
150
+ else:
151
+ console.print(f"\n[yellow]⚠ Analysis incomplete: {result.get('message')}[/yellow]")
152
+
153
+ except Exception as e:
154
+ console.print(f"\n[red]✗ Unexpected error: {e}[/red]")
155
+ import traceback
156
+ console.print(traceback.format_exc())
157
+
158
+ # Cache statistics
159
+ console.print("\n[bold]📦 Cache Statistics:[/bold]")
160
+ cache_stats = copilot.get_cache_stats()
161
+ console.print(f" Valid Entries: {cache_stats['valid_entries']}")
162
+ console.print(f" Cache Size: {cache_stats['size_mb']} MB")
163
+
164
+
165
+ if __name__ == "__main__":
166
+ main()
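
Because the example persists the full result to `./outputs/reports/titanic_analysis.json`, the workflow can be inspected after the fact. A small illustrative sketch that reloads the report and summarizes which tools ran (field names taken from the script above):

```python
# Reload the report written by titanic_example.py and summarize tool outcomes.
import json
from pathlib import Path

report = json.loads(Path("./outputs/reports/titanic_analysis.json").read_text())

for i, step in enumerate(report["workflow_history"], 1):
    ok = step["result"].get("success", False)
    print(f"{i}. {'OK  ' if ok else 'FAIL'} {step['tool']}")

print(f"Iterations: {report['iterations']}, API calls: {report['api_calls']}")
```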
requirements.txt ADDED
@@ -0,0 +1,98 @@
1
+ # Core Dependencies
2
+ groq==0.11.0
3
+ python-dotenv==1.0.0
4
+
5
+ # Data Processing
6
+ polars>=0.20.3
7
+ duckdb>=0.10.0
8
+ pyarrow>=14.0.1
9
+ pandas>=2.2.0 # Updated for Python 3.13 compatibility
10
+
11
+ # Machine Learning
12
+ scikit-learn>=1.4.0
13
+ xgboost>=2.0.3
14
+ lightgbm>=4.6.0
15
+ catboost>=1.2.8
16
+ optuna>=3.5.0
17
+
18
+ # Explainability
19
+ shap>=0.44.1
20
+
21
+ # Advanced ML Tools
22
+ imbalanced-learn>=0.12.0
23
+
24
+ # Statistical Analysis
25
+ scipy>=1.11.4
26
+ statsmodels>=0.14.1
27
+
28
+ # Visualization
29
+ matplotlib>=3.8.2
30
+ seaborn>=0.13.1
31
+ plotly>=5.18.0 # Interactive visualizations
32
+
33
+ # EDA Report Generation
34
+ sweetviz>=2.3.1 # Beautiful fast EDA reports
35
+ ydata-profiling>=4.17.0 # Updated for Python 3.13 compatibility
36
+
37
+ # User Interface
38
+ # gradio>=5.49.1 # Replaced with React frontend
39
+
40
+ # REST API (Cloud Run)
41
+ fastapi>=0.109.0
42
+ uvicorn>=0.25.0
43
+ python-multipart>=0.0.6 # For file uploads
44
+
45
+ # Text Processing
46
+ textblob>=0.17.1
47
+
48
+ # Time Series Forecasting
49
+ prophet>=1.1.5
50
+ holidays>=0.38
51
+
52
+ # MLOps & Explainability
53
+ lime==0.2.0.1
54
+ fairlearn==0.10.0
55
+
56
+ # NLP (Optional - Uncomment for advanced NLP tools)
57
+ # These are optional but recommended for full NLP capabilities
58
+ # spacy==3.7.2 # For named entity recognition (perform_named_entity_recognition)
59
+ # transformers==4.35.2 # For transformer-based sentiment & topic modeling
60
+ # sentence-transformers==2.2.2 # For semantic text similarity
61
+ # bertopic==0.16.0 # For advanced topic modeling
62
+
63
+ # Computer Vision (Optional - Uncomment for CV tools)
64
+ # These are optional but recommended for full CV capabilities
65
+ # torch==2.1.0 # For CNN-based image feature extraction
66
+ # torchvision==0.16.0 # For pre-trained models (ResNet, EfficientNet, VGG)
67
+ Pillow==10.1.0 # For basic image processing
68
+ # opencv-python==4.8.1 # For advanced image processing & color features
69
+
70
+ # Business Intelligence (Optional - Uncomment for advanced BI tools)
71
+ # These are optional but add specialized capabilities
72
+ # lifetimes==0.11.3 # For customer lifetime value modeling
73
+ # econml==0.15.0 # For advanced causal inference
74
+
75
+ # CLI & UI
76
+ typer==0.9.0
77
+ rich==13.7.0
78
+ tqdm==4.66.1
79
+
80
+ # Utilities
81
+ pydantic==2.5.3
82
+ joblib==1.3.2
83
+
84
+ # Google Cloud Integration
85
+ google-cloud-bigquery==3.14.1
86
+ google-cloud-storage==2.14.0 # For GCS artifact storage
87
+ google-auth==2.25.2
88
+ google-generativeai==0.3.2 # For Gemini LLM support
89
+
90
+ # Testing
91
+ pytest==7.4.3
92
+ pytest-mock==3.12.0
93
+ pytest-cov==4.1.0
94
+
95
+ # Development
96
+ black==23.12.1
97
+ flake8==7.0.0
98
+ mypy==1.8.0
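
Since several NLP/CV/BI extras above are commented out, it can help to check at runtime which ones are actually importable before enabling the corresponding tools. A minimal sketch (import names inferred from the packages listed; note `opencv-python` imports as `cv2`):

```python
# Report which optional extras from requirements.txt are importable.
from importlib.util import find_spec

OPTIONAL_EXTRAS = [
    "spacy", "transformers", "sentence_transformers", "bertopic",  # NLP
    "torch", "torchvision", "cv2",                                 # CV
    "lifetimes", "econml",                                         # BI
]

for name in OPTIONAL_EXTRAS:
    status = "installed" if find_spec(name) is not None else "missing"
    print(f"{name:25s} {status}")
```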
setup-deployment.sh ADDED
@@ -0,0 +1,78 @@
1
+ #!/bin/bash
2
+ # Quick setup script for macOS deployment prerequisites
3
+
4
+ set -e
5
+
6
+ RED='\033[0;31m'
7
+ GREEN='\033[0;32m'
8
+ YELLOW='\033[1;33m'
9
+ BLUE='\033[0;34m'
10
+ NC='\033[0m'
11
+
12
+ echo -e "${BLUE}🔧 Data Science Agent - Deployment Setup${NC}"
13
+ echo "=========================================="
14
+ echo ""
15
+
16
+ # Check if Homebrew is installed
17
+ if ! command -v brew &> /dev/null; then
18
+ echo -e "${RED}❌ Homebrew not found${NC}"
19
+ echo "Installing Homebrew..."
20
+ /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
21
+ else
22
+ echo -e "${GREEN}✅ Homebrew installed${NC}"
23
+ fi
24
+
25
+ # Install Docker Desktop
26
+ if ! command -v docker &> /dev/null; then
27
+ echo -e "${YELLOW}📦 Installing Docker Desktop...${NC}"
28
+ brew install --cask docker
29
+ echo -e "${GREEN}✅ Docker Desktop installed${NC}"
30
+ echo -e "${YELLOW}⚠️ Please start Docker Desktop application, then run this script again${NC}"
31
+ exit 0
32
+ else
33
+ echo -e "${GREEN}✅ Docker installed${NC}"
34
+ fi
35
+
36
+ # Check if Docker daemon is running
37
+ if ! docker info &> /dev/null; then
38
+ echo -e "${YELLOW}⚠️ Docker is installed but not running${NC}"
39
+ echo "Please start Docker Desktop application, then run this script again"
40
+ exit 0
41
+ fi
42
+
43
+ # Install Google Cloud SDK
44
+ if ! command -v gcloud &> /dev/null; then
45
+ echo -e "${YELLOW}☁️ Installing Google Cloud SDK...${NC}"
46
+ brew install --cask google-cloud-sdk
47
+ echo -e "${GREEN}✅ Google Cloud SDK installed${NC}"
48
+
49
+ echo ""
50
+ echo -e "${YELLOW}📝 Next steps:${NC}"
51
+ echo "1. Restart your terminal to load gcloud"
52
+ echo "2. Run: gcloud auth login"
53
+ echo "3. Run: gcloud auth application-default login"
54
+ echo "4. Run: gcloud config set project YOUR_PROJECT_ID"
55
+ echo "5. Run: ./deploy.sh"
56
+ else
57
+ echo -e "${GREEN}✅ Google Cloud SDK installed${NC}"
58
+ fi
59
+
60
+ echo ""
61
+ echo -e "${BLUE}========================================${NC}"
62
+ echo -e "${GREEN}✅ Setup complete!${NC}"
63
+ echo ""
64
+ echo "Next steps:"
65
+ echo "1. Authenticate with Google Cloud:"
66
+ echo -e " ${YELLOW}gcloud auth login${NC}"
67
+ echo -e " ${YELLOW}gcloud auth application-default login${NC}"
68
+ echo ""
69
+ echo "2. Set your GCP project:"
70
+ echo -e " ${YELLOW}gcloud config set project YOUR_PROJECT_ID${NC}"
71
+ echo ""
72
+ echo "3. Set your API keys:"
73
+ echo -e " ${YELLOW}export GROQ_API_KEY='your-groq-key'${NC}"
74
+ echo -e " ${YELLOW}export GOOGLE_API_KEY='your-google-key'${NC}"
75
+ echo ""
76
+ echo "4. Deploy to Cloud Run:"
77
+ echo -e " ${YELLOW}./deploy.sh${NC}"
78
+ echo ""
src/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Data Science Copilot - AI-powered data science automation."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .orchestrator import DataScienceCopilot
6
+
7
+ __all__ = ["DataScienceCopilot"]
src/api/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ Cloud Run API Module
3
+ FastAPI wrapper for DataScienceCopilot
4
+ """
src/api/app.py ADDED
@@ -0,0 +1,513 @@
1
+ """
2
+ FastAPI Application for Google Cloud Run
3
+ Thin HTTP wrapper around DataScienceCopilot - No logic changes, just API exposure.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import tempfile
9
+ import shutil
10
+ from pathlib import Path
11
+ from typing import Optional, Dict, Any, List
12
+ import logging
13
+ from dotenv import load_dotenv
14
+
15
+ # Load environment variables from .env file
16
+ load_dotenv()
17
+
18
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
19
+ from fastapi.responses import JSONResponse, FileResponse
20
+ from fastapi.staticfiles import StaticFiles
21
+ from fastapi.middleware.cors import CORSMiddleware
22
+ from pydantic import BaseModel
23
+
24
+ # Add src to path for imports
25
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
26
+
27
+ from orchestrator import DataScienceCopilot
28
+
29
+ # Configure logging
30
+ logging.basicConfig(level=logging.INFO)
31
+ logger = logging.getLogger(__name__)
32
+
33
+ # Initialize FastAPI
34
+ app = FastAPI(
35
+ title="Data Science Agent API",
36
+ description="Cloud Run wrapper for autonomous data science workflows",
37
+ version="1.0.0"
38
+ )
39
+
40
+ # Enable CORS for frontend
41
+ app.add_middleware(
42
+ CORSMiddleware,
43
+ allow_origins=["*"], # Configure this properly in production
44
+ allow_credentials=True,
45
+ allow_methods=["*"],
46
+ allow_headers=["*"],
47
+ )
48
+
49
+ # Initialize agent once (singleton pattern for stateless service)
50
+ # Agent itself is stateless - no conversation memory between requests
51
+ agent: Optional[DataScienceCopilot] = None
52
+
53
+ # Mount static files for React frontend
54
+ frontend_path = Path(__file__).parent.parent.parent / "FRRONTEEEND" / "dist"
55
+ if frontend_path.exists():
56
+ app.mount("/assets", StaticFiles(directory=str(frontend_path / "assets")), name="assets")
57
+ logger.info(f"✅ Frontend assets mounted from {frontend_path}")
58
+
59
+
60
+ @app.on_event("startup")
61
+ async def startup_event():
62
+ """Initialize DataScienceCopilot on service startup."""
63
+ global agent
64
+ try:
65
+ logger.info("Initializing DataScienceCopilot...")
66
+ agent = DataScienceCopilot(
67
+ reasoning_effort="medium",
68
+ provider=os.getenv("LLM_PROVIDER", "groq")
69
+ )
70
+ logger.info(f"✅ Agent initialized with provider: {agent.provider}")
71
+ except Exception as e:
72
+ logger.error(f"❌ Failed to initialize agent: {e}")
73
+ raise
74
+
75
+
76
+ @app.get("/api/health")
77
+ async def root():
78
+ """Health check endpoint."""
79
+ return {
80
+ "service": "Data Science Agent API",
81
+ "status": "healthy",
82
+ "provider": agent.provider if agent else "not initialized",
83
+ "tools_available": len(agent.tool_functions) if agent else 0
84
+ }
85
+
86
+
87
+ @app.get("/health")
88
+ async def health_check():
89
+ """
90
+ Health check for Cloud Run.
91
+ Returns 200 if service is ready to accept requests.
92
+ """
93
+ if agent is None:
94
+ raise HTTPException(status_code=503, detail="Agent not initialized")
95
+
96
+ return {
97
+ "status": "healthy",
98
+ "agent_ready": True,
99
+ "provider": agent.provider,
100
+ "tools_count": len(agent.tool_functions)
101
+ }
102
+
103
+
104
+ class AnalysisRequest(BaseModel):
105
+ """Request model for analysis endpoint (JSON body)."""
106
+ task_description: str
107
+ target_col: Optional[str] = None
108
+ use_cache: bool = True
109
+ max_iterations: int = 20
110
+
111
+
112
+ @app.post("/run")
113
+ async def run_analysis(
114
+ file: UploadFile = File(..., description="Dataset file (CSV or Parquet)"),
115
+ task_description: str = Form(..., description="Natural language task description"),
116
+ target_col: Optional[str] = Form(None, description="Target column name for prediction"),
117
+ use_cache: bool = Form(True, description="Enable caching for expensive operations"),
118
+ max_iterations: int = Form(20, description="Maximum workflow iterations")
119
+ ) -> JSONResponse:
120
+ """
121
+ Run complete data science workflow on uploaded dataset.
122
+
123
+ This is a thin wrapper - all logic lives in DataScienceCopilot.analyze().
124
+
125
+ Args:
126
+ file: CSV or Parquet file upload
127
+ task_description: Natural language description of the task
128
+ target_col: Optional target column for ML tasks
129
+ use_cache: Whether to use cached results
130
+ max_iterations: Maximum number of workflow steps
131
+
132
+ Returns:
133
+ JSON response with analysis results, workflow history, and execution stats
134
+
135
+ Example:
136
+ ```bash
137
+ curl -X POST http://localhost:8080/run \
138
+ -F "file=@data.csv" \
139
+ -F "task_description=Analyze this dataset and predict house prices" \
140
+ -F "target_col=price"
141
+ ```
142
+ """
143
+ if agent is None:
144
+ raise HTTPException(status_code=503, detail="Agent not initialized")
145
+
146
+ # Validate file format
147
+ filename = file.filename.lower()
148
+ if not (filename.endswith('.csv') or filename.endswith('.parquet')):
149
+ raise HTTPException(
150
+ status_code=400,
151
+ detail="Invalid file format. Only CSV and Parquet files are supported."
152
+ )
153
+
154
+ # Use /tmp for Cloud Run (ephemeral storage)
155
+ temp_dir = Path("/tmp") / "data_science_agent"
156
+ temp_dir.mkdir(parents=True, exist_ok=True)
157
+
158
+ temp_file_path = None
159
+
160
+ try:
161
+ # Save uploaded file to temporary location
162
+ temp_file_path = temp_dir / file.filename
163
+ logger.info(f"Saving uploaded file to: {temp_file_path}")
164
+
165
+ with open(temp_file_path, "wb") as buffer:
166
+ shutil.copyfileobj(file.file, buffer)
167
+
168
+ logger.info(f"File saved successfully: {file.filename} ({os.path.getsize(temp_file_path)} bytes)")
169
+
170
+ # Call existing agent logic - NO CHANGES to orchestrator
171
+ logger.info(f"Starting analysis with task: {task_description}")
172
+ result = agent.analyze(
173
+ file_path=str(temp_file_path),
174
+ task_description=task_description,
175
+ target_col=target_col,
176
+ use_cache=use_cache,
177
+ max_iterations=max_iterations
178
+ )
179
+
180
+ logger.info(f"Analysis completed: {result.get('status')}")
181
+
182
+ # Filter out non-JSON-serializable objects (like matplotlib/plotly Figures)
183
+ def make_json_serializable(obj):
184
+ """Recursively convert objects to JSON-serializable format."""
185
+ if isinstance(obj, dict):
186
+ return {k: make_json_serializable(v) for k, v in obj.items()}
187
+ elif isinstance(obj, list):
188
+ return [make_json_serializable(item) for item in obj]
189
+ elif hasattr(obj, '__class__') and obj.__class__.__name__ in ['Figure', 'Axes', 'Artist']:
190
+ # Skip matplotlib/plotly Figure objects
191
+ return f"<{obj.__class__.__name__} object - see artifacts>"
192
+ elif isinstance(obj, (str, int, float, bool, type(None))):
193
+ return obj
194
+ else:
195
+ # Try to convert to string for other types
196
+ try:
197
+ return str(obj)
198
+ except:
199
+ return f"<{type(obj).__name__}>"
200
+
201
+ serializable_result = make_json_serializable(result)
202
+
203
+ # Return result as-is from orchestrator
204
+ return JSONResponse(
205
+ content={
206
+ "success": result.get("status") == "success",
207
+ "result": serializable_result,
208
+ "metadata": {
209
+ "filename": file.filename,
210
+ "task": task_description,
211
+ "target": target_col,
212
+ "provider": agent.provider
213
+ }
214
+ },
215
+ status_code=200
216
+ )
217
+
218
+ except Exception as e:
219
+ logger.error(f"Analysis failed: {str(e)}", exc_info=True)
220
+ raise HTTPException(
221
+ status_code=500,
222
+ detail={
223
+ "error": str(e),
224
+ "error_type": type(e).__name__,
225
+ "message": "Analysis workflow failed. Check logs for details."
226
+ }
227
+ )
228
+
229
+ finally:
230
+ # Cleanup temporary file
231
+ if temp_file_path and temp_file_path.exists():
232
+ try:
233
+ temp_file_path.unlink()
234
+ logger.info(f"Cleaned up temporary file: {temp_file_path}")
235
+ except Exception as e:
236
+ logger.warning(f"Failed to cleanup temp file: {e}")
237
+
238
+
239
+ @app.post("/profile")
240
+ async def profile_dataset(
241
+ file: UploadFile = File(..., description="Dataset file (CSV or Parquet)")
242
+ ) -> JSONResponse:
243
+ """
244
+ Quick dataset profiling without full workflow.
245
+
246
+ Returns basic statistics, data types, and quality issues.
247
+ Useful for initial data exploration without running full analysis.
248
+
249
+ Example:
250
+ ```bash
251
+ curl -X POST http://localhost:8080/profile \
252
+ -F "file=@data.csv"
253
+ ```
254
+ """
255
+ if agent is None:
256
+ raise HTTPException(status_code=503, detail="Agent not initialized")
257
+
258
+ filename = file.filename.lower()
259
+ if not (filename.endswith('.csv') or filename.endswith('.parquet')):
260
+ raise HTTPException(
261
+ status_code=400,
262
+ detail="Invalid file format. Only CSV and Parquet files are supported."
263
+ )
264
+
265
+ temp_dir = Path("/tmp") / "data_science_agent"
266
+ temp_dir.mkdir(parents=True, exist_ok=True)
267
+ temp_file_path = None
268
+
269
+ try:
270
+ # Save file temporarily
271
+ temp_file_path = temp_dir / file.filename
272
+ with open(temp_file_path, "wb") as buffer:
273
+ shutil.copyfileobj(file.file, buffer)
274
+
275
+ # Import profiling tool directly
276
+ from tools.data_profiling import profile_dataset as profile_tool
277
+ from tools.data_profiling import detect_data_quality_issues
278
+
279
+ # Run profiling tools
280
+ logger.info(f"Profiling dataset: {file.filename}")
281
+ profile_result = profile_tool(str(temp_file_path))
282
+ quality_result = detect_data_quality_issues(str(temp_file_path))
283
+
284
+ return JSONResponse(
285
+ content={
286
+ "success": True,
287
+ "filename": file.filename,
288
+ "profile": profile_result,
289
+ "quality_issues": quality_result
290
+ },
291
+ status_code=200
292
+ )
293
+
294
+ except Exception as e:
295
+ logger.error(f"Profiling failed: {str(e)}", exc_info=True)
296
+ raise HTTPException(
297
+ status_code=500,
298
+ detail={
299
+ "error": str(e),
300
+ "error_type": type(e).__name__
301
+ }
302
+ )
303
+
304
+ finally:
305
+ if temp_file_path and temp_file_path.exists():
306
+ try:
307
+ temp_file_path.unlink()
308
+ except Exception as e:
309
+ logger.warning(f"Failed to cleanup temp file: {e}")
310
+
311
+
312
+ @app.get("/tools")
313
+ async def list_tools():
314
+ """
315
+ List all available tools in the agent.
316
+
317
+ Returns tool names organized by category.
318
+ Useful for understanding agent capabilities.
319
+ """
320
+ if agent is None:
321
+ raise HTTPException(status_code=503, detail="Agent not initialized")
322
+
323
+ from tools.tools_registry import get_tools_by_category
324
+
325
+ return {
326
+ "total_tools": len(agent.tool_functions),
327
+ "tools_by_category": get_tools_by_category(),
328
+ "all_tools": list(agent.tool_functions.keys())
329
+ }
330
+
331
+
332
+ class ChatMessage(BaseModel):
333
+ """Chat message model."""
334
+ role: str # 'user' or 'assistant'
335
+ content: str
336
+
337
+
338
+ class ChatRequest(BaseModel):
339
+ """Chat request model."""
340
+ messages: List[ChatMessage]
341
+ stream: bool = False
342
+
343
+
344
+ @app.post("/chat")
345
+ async def chat(request: ChatRequest) -> JSONResponse:
346
+ """
347
+ Chat endpoint for conversational interface.
348
+
349
+ Processes chat messages and returns agent responses.
350
+ Uses the same underlying agent as /run but in chat format.
351
+
352
+ Args:
353
+ request: Chat request with message history
354
+
355
+ Returns:
356
+ JSON response with agent's reply
357
+ """
358
+ if agent is None:
359
+ raise HTTPException(status_code=503, detail="Agent not initialized")
360
+
361
+ try:
362
+ # Extract the latest user message
363
+ user_messages = [msg for msg in request.messages if msg.role == "user"]
364
+ if not user_messages:
365
+ raise HTTPException(status_code=400, detail="No user message found")
366
+
367
+ latest_message = user_messages[-1].content
368
+
369
+ # Check for API key
370
+ api_key = os.getenv("GOOGLE_API_KEY")
371
+ if not api_key:
372
+ raise HTTPException(
373
+ status_code=500,
374
+ detail="GOOGLE_API_KEY not configured. Please set the environment variable."
375
+ )
376
+
377
+ # Use Google Gemini API
378
+ import google.generativeai as genai
379
+
380
+ logger.info(f"Configuring Gemini with API key (length: {len(api_key)})")
381
+ genai.configure(api_key=api_key)
382
+
383
+ # Initialize Gemini model
384
+ model = genai.GenerativeModel(
385
+ model_name=os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite"),
386
+ system_instruction="You are a Senior Data Science Autonomous Agent. You help users with end-to-end machine learning, data profiling, visualization, and strategic insights. Use a professional, technical yet accessible tone. Provide code snippets in Python if requested. You have access to tools for data analysis, ML training, visualization, and more."
387
+ )
388
+
389
+ # Convert messages to Gemini format (exclude system message, just conversation)
390
+ chat_history = []
391
+ for msg in request.messages[:-1]: # Exclude the latest message
392
+ chat_history.append({
393
+ "role": "user" if msg.role == "user" else "model",
394
+ "parts": [msg.content]
395
+ })
396
+
397
+ # Start chat with history
398
+ chat_session = model.start_chat(history=chat_history)
399
+
400
+ # Send the latest message
401
+ response = chat_session.send_message(latest_message)
402
+
403
+ assistant_message = response.text
404
+
405
+ return JSONResponse(
406
+ content={
407
+ "success": True,
408
+ "message": assistant_message,
409
+ "model": os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite"),
410
+ "provider": "gemini"
411
+ },
412
+ status_code=200
413
+ )
414
+
415
+ except Exception as e:
416
+ logger.error(f"Chat failed: {str(e)}", exc_info=True)
417
+ raise HTTPException(
418
+ status_code=500,
419
+ detail={
420
+ "error": str(e),
421
+ "error_type": type(e).__name__
422
+ }
423
+ )
424
+
425
+
426
+ # Error handlers
427
+ @app.exception_handler(HTTPException)
428
+ async def http_exception_handler(request, exc):
429
+ """Custom error response format."""
430
+ return JSONResponse(
431
+ status_code=exc.status_code,
432
+ content={
433
+ "success": False,
434
+ "error": exc.detail,
435
+ "status_code": exc.status_code
436
+ }
437
+ )
438
+
439
+
440
+ @app.exception_handler(Exception)
441
+ async def general_exception_handler(request, exc):
442
+ """Catch-all error handler."""
443
+ logger.error(f"Unhandled exception: {str(exc)}", exc_info=True)
444
+ return JSONResponse(
445
+ status_code=500,
446
+ content={
447
+ "success": False,
448
+ "error": "Internal server error",
449
+ "detail": str(exc),
450
+ "error_type": type(exc).__name__
451
+ }
452
+ )
453
+
454
+
455
+ @app.get("/outputs/{file_path:path}")
456
+ async def serve_output_files(file_path: str):
457
+ """
458
+ Serve generated output files (reports, plots, models, etc.).
459
+ """
460
+ output_path = Path("./outputs") / file_path
461
+
462
+ if not output_path.exists():
463
+ raise HTTPException(status_code=404, detail=f"File not found: {file_path}")
464
+
465
+ if not output_path.is_file():
466
+ raise HTTPException(status_code=400, detail="Path is not a file")
467
+
468
+ # Security: prevent directory traversal
469
+ try:
470
+ output_path.resolve().relative_to(Path("./outputs").resolve())
471
+ except ValueError:
472
+ raise HTTPException(status_code=403, detail="Access denied")
473
+
474
+ return FileResponse(output_path)
475
+
476
+
477
+ @app.get("/{full_path:path}")
478
+ async def serve_frontend(full_path: str):
479
+ """
480
+ Serve React frontend for all non-API routes.
481
+ This should be the last route defined.
482
+ """
483
+ frontend_path = Path(__file__).parent.parent.parent / "FRRONTEEEND" / "dist"
484
+
485
+ # Try to serve the requested file
486
+ file_path = frontend_path / full_path
487
+ if file_path.is_file():
488
+ return FileResponse(file_path)
489
+
490
+ # Default to index.html for client-side routing
491
+ index_path = frontend_path / "index.html"
492
+ if index_path.exists():
493
+ return FileResponse(index_path)
494
+
495
+ # Frontend not built
496
+ raise HTTPException(
497
+ status_code=404,
498
+ detail="Frontend not found. Please build the frontend first: cd FRRONTEEEND && npm run build"
499
+ )
500
+
501
+
502
+ # Cloud Run listens on PORT environment variable
503
+ if __name__ == "__main__":
504
+ import uvicorn
505
+
506
+ port = int(os.getenv("PORT", 8080))
507
+
508
+ uvicorn.run(
509
+ "app:app",
510
+ host="0.0.0.0",
511
+ port=port,
512
+ log_level="info"
513
+ )
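
For callers who prefer Python over the curl examples in the docstrings above, a hypothetical client sketch for `/run` follows. It assumes the `requests` package, which is not pinned in requirements.txt; the form fields mirror the endpoint signature:

```python
# Hypothetical Python client for the /run endpoint.
import requests  # not pinned in requirements.txt - install separately

with open("data.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8080/run",
        files={"file": ("data.csv", f, "text/csv")},
        data={
            "task_description": "Analyze this dataset and predict the target column",
            "target_col": "target",  # optional, as in the endpoint signature
            "max_iterations": 20,
        },
        timeout=900,  # matches the Cloud Run --timeout used in deploy.sh
    )

resp.raise_for_status()
body = resp.json()
print(body["success"], body["metadata"])
```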
src/cache/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Cache module initialization."""
2
+
3
+ from .cache_manager import CacheManager
4
+
5
+ __all__ = ["CacheManager"]
src/cache/cache_manager.py ADDED
@@ -0,0 +1,292 @@
1
+ """
2
+ Cache Manager for Data Science Copilot
3
+ Uses SQLite for persistent caching of API responses and computation results.
4
+ """
5
+
6
+ import hashlib
7
+ import json
8
+ import sqlite3
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+ import pickle
13
+
14
+
15
+ class CacheManager:
16
+ """
17
+ Manages caching of LLM responses and expensive computations.
18
+
19
+ Uses SQLite for persistence and supports TTL-based invalidation.
20
+ Cache keys are generated from file hashes and operation parameters.
21
+ """
22
+
23
+ def __init__(self, db_path: str = "./cache_db/cache.db", ttl_seconds: int = 86400):
24
+ """
25
+ Initialize cache manager.
26
+
27
+ Args:
28
+ db_path: Path to SQLite database file
29
+ ttl_seconds: Time-to-live for cache entries (default 24 hours)
30
+ """
31
+ self.db_path = Path(db_path)
32
+ self.ttl_seconds = ttl_seconds
33
+
34
+ # Ensure cache directory exists
35
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
36
+
37
+ # Initialize database
38
+ self._init_db()
39
+
40
+ def _init_db(self) -> None:
41
+ """Create cache table if it doesn't exist."""
42
+ try:
43
+ conn = sqlite3.connect(self.db_path)
44
+ cursor = conn.cursor()
45
+
46
+ cursor.execute("""
47
+ CREATE TABLE IF NOT EXISTS cache (
48
+ key TEXT PRIMARY KEY,
49
+ value BLOB NOT NULL,
50
+ created_at INTEGER NOT NULL,
51
+ expires_at INTEGER NOT NULL,
52
+ metadata TEXT
53
+ )
54
+ """)
55
+
56
+ # Create index on expires_at for efficient cleanup
57
+ cursor.execute("""
58
+ CREATE INDEX IF NOT EXISTS idx_expires_at
59
+ ON cache(expires_at)
60
+ """)
61
+
62
+ conn.commit()
63
+ conn.close()
64
+ print(f"✅ Cache database initialized at {self.db_path}")
65
+ except Exception as e:
66
+ print(f"⚠️ Error initializing cache database: {e}")
67
+ print(f" Attempting to recreate database...")
68
+ try:
69
+ # Remove corrupted database and recreate
70
+ if self.db_path.exists():
71
+ self.db_path.unlink()
72
+
73
+ conn = sqlite3.connect(self.db_path)
74
+ cursor = conn.cursor()
75
+
76
+ cursor.execute("""
77
+ CREATE TABLE cache (
78
+ key TEXT PRIMARY KEY,
79
+ value BLOB NOT NULL,
80
+ created_at INTEGER NOT NULL,
81
+ expires_at INTEGER NOT NULL,
82
+ metadata TEXT
83
+ )
84
+ """)
85
+
86
+ cursor.execute("""
87
+ CREATE INDEX idx_expires_at
88
+ ON cache(expires_at)
89
+ """)
90
+
91
+ conn.commit()
92
+ conn.close()
93
+ print(f"✅ Cache database recreated successfully")
94
+ except Exception as e2:
95
+ print(f"❌ Failed to recreate cache database: {e2}")
96
+ print(f" Cache functionality will be disabled")
97
+
98
+ def _generate_key(self, *args, **kwargs) -> str:
99
+ """
100
+ Generate a unique cache key from arguments.
101
+
102
+ Args:
103
+ *args: Positional arguments to hash
104
+ **kwargs: Keyword arguments to hash
105
+
106
+ Returns:
107
+ MD5 hash of the arguments
108
+ """
109
+ # Combine args and kwargs into a single string
110
+ key_data = json.dumps({"args": args, "kwargs": kwargs}, sort_keys=True)
111
+ return hashlib.md5(key_data.encode()).hexdigest()
112
+
113
+ def get(self, key: str) -> Optional[Any]:
114
+ """
115
+ Retrieve value from cache.
116
+
117
+ Args:
118
+ key: Cache key
119
+
120
+ Returns:
121
+ Cached value if exists and not expired, None otherwise
122
+ """
123
+ try:
124
+ conn = sqlite3.connect(self.db_path)
125
+ cursor = conn.cursor()
126
+
127
+ current_time = int(time.time())
128
+
129
+ cursor.execute("""
130
+ SELECT value, expires_at
131
+ FROM cache
132
+ WHERE key = ? AND expires_at > ?
133
+ """, (key, current_time))
134
+
135
+ result = cursor.fetchone()
136
+ conn.close()
137
+ except sqlite3.OperationalError as e:
138
+ print(f"⚠️ Cache read error: {e}")
139
+ print(f" Reinitializing cache database...")
140
+ self._init_db()
141
+ return None
142
+ except Exception as e:
143
+ print(f"⚠️ Unexpected cache error: {e}")
144
+ return None
145
+
146
+ if result:
147
+ value_blob, expires_at = result
148
+ # Deserialize using pickle for complex Python objects
149
+ return pickle.loads(value_blob)
150
+
151
+ return None
152
+
153
+ def set(self, key: str, value: Any, ttl_override: Optional[int] = None,
154
+ metadata: Optional[dict] = None) -> None:
155
+ """
156
+ Store value in cache.
157
+
158
+ Args:
159
+ key: Cache key
160
+ value: Value to cache (must be pickleable)
161
+ ttl_override: Optional override for TTL (seconds)
162
+ metadata: Optional metadata to store with cache entry
163
+ """
164
+ try:
165
+ conn = sqlite3.connect(self.db_path)
166
+ cursor = conn.cursor()
167
+
168
+ current_time = int(time.time())
169
+ ttl = ttl_override if ttl_override is not None else self.ttl_seconds
170
+ expires_at = current_time + ttl
171
+
172
+ # Serialize value using pickle
173
+ value_blob = pickle.dumps(value)
174
+
175
+ # Serialize metadata as JSON
176
+ metadata_json = json.dumps(metadata) if metadata else None
177
+
178
+ cursor.execute("""
179
+ INSERT OR REPLACE INTO cache (key, value, created_at, expires_at, metadata)
180
+ VALUES (?, ?, ?, ?, ?)
181
+ """, (key, value_blob, current_time, expires_at, metadata_json))
182
+
183
+ conn.commit()
184
+ conn.close()
185
+ except sqlite3.OperationalError as e:
186
+ print(f"⚠️ Cache write error: {e}")
187
+ print(f" Reinitializing cache database...")
188
+ self._init_db()
189
+ except Exception as e:
190
+ print(f"⚠️ Unexpected cache error during write: {e}")
191
+
192
+ def invalidate(self, key: str) -> bool:
193
+ """
194
+ Remove specific entry from cache.
195
+
196
+ Args:
197
+ key: Cache key to invalidate
198
+
199
+ Returns:
200
+ True if entry was removed, False if not found
201
+ """
202
+ conn = sqlite3.connect(self.db_path)
203
+ cursor = conn.cursor()
204
+
205
+ cursor.execute("DELETE FROM cache WHERE key = ?", (key,))
206
+ deleted = cursor.rowcount > 0
207
+
208
+ conn.commit()
209
+ conn.close()
210
+
211
+ return deleted
212
+
213
+ def clear_expired(self) -> int:
214
+ """
215
+ Remove all expired entries from cache.
216
+
217
+ Returns:
218
+ Number of entries removed
219
+ """
220
+ conn = sqlite3.connect(self.db_path)
221
+ cursor = conn.cursor()
222
+
223
+ current_time = int(time.time())
224
+ cursor.execute("DELETE FROM cache WHERE expires_at <= ?", (current_time,))
225
+ deleted = cursor.rowcount
226
+
227
+ conn.commit()
228
+ conn.close()
229
+
230
+ return deleted
231
+
232
+ def clear_all(self) -> None:
233
+ """Remove all entries from cache."""
234
+ conn = sqlite3.connect(self.db_path)
235
+ cursor = conn.cursor()
236
+
237
+ cursor.execute("DELETE FROM cache")
238
+
239
+ conn.commit()
240
+ conn.close()
241
+
242
+ def get_stats(self) -> dict:
243
+ """
244
+ Get cache statistics.
245
+
246
+ Returns:
247
+ Dictionary with cache stats (total entries, expired, size)
248
+ """
249
+ conn = sqlite3.connect(self.db_path)
250
+ cursor = conn.cursor()
251
+
252
+ current_time = int(time.time())
253
+
254
+ # Total entries
255
+ cursor.execute("SELECT COUNT(*) FROM cache")
256
+ total = cursor.fetchone()[0]
257
+
258
+ # Valid entries
259
+ cursor.execute("SELECT COUNT(*) FROM cache WHERE expires_at > ?", (current_time,))
260
+ valid = cursor.fetchone()[0]
261
+
262
+ # Database size
263
+ cursor.execute("SELECT page_count * page_size FROM pragma_page_count(), pragma_page_size()")
264
+ size_bytes = cursor.fetchone()[0]
265
+
266
+ conn.close()
267
+
268
+ return {
269
+ "total_entries": total,
270
+ "valid_entries": valid,
271
+ "expired_entries": total - valid,
272
+ "size_mb": round(size_bytes / (1024 * 1024), 2)
273
+ }
274
+
275
+ def generate_file_hash(self, file_path: str) -> str:
276
+ """
277
+ Generate hash of file contents for cache key.
278
+
279
+ Args:
280
+ file_path: Path to file
281
+
282
+ Returns:
283
+ MD5 hash of file contents
284
+ """
285
+ hasher = hashlib.md5()
286
+
287
+ with open(file_path, 'rb') as f:
288
+ # Read file in chunks to handle large files
289
+ for chunk in iter(lambda: f.read(4096), b""):
290
+ hasher.update(chunk)
291
+
292
+ return hasher.hexdigest()
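
A minimal usage sketch for `CacheManager`, keying an entry on the dataset's content hash. The `profile:` key scheme is illustrative only; how the orchestrator actually composes keys is defined elsewhere in the repo. The `sys.path` setup mirrors the pattern used in `examples/titanic_example.py`:

```python
# Illustrative CacheManager usage (script assumed to live at the repo root).
import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
from cache import CacheManager

cache = CacheManager(db_path="./cache_db/cache.db", ttl_seconds=3600)

file_hash = cache.generate_file_hash("data.csv")
key = f"profile:{file_hash}"  # hypothetical key scheme

profile = cache.get(key)
if profile is None:
    profile = {"rows": 891, "columns": 12}  # stand-in for a real profiling result
    cache.set(key, profile, metadata={"tool": "profile_dataset"})

print(cache.get_stats())  # total/valid/expired entries and size in MB
```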