diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..31da76a70f16340f00a7a15f371707bdd5660802 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,74 @@ +# Python cache and environment +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.venv/ +venv/ +ENV/ +env/ + +# Development files +.git/ +.gitignore +.env +.env.local +*.log + +# Output directories (not needed in container) +outputs/ +cache_db/ +temp/ +test_data/ +data/ + +# Frontend development files (will be built in Docker) +FRRONTEEEND/node_modules/ +FRRONTEEEND/.env +FRRONTEEEND/.env.local + +# Documentation and tests +*.md +!README.md +tests/ +test_*.py +check_*.py + +# Old Gradio UI (no longer used) +chat_ui.py + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + +# Jupyter notebooks +*.ipynb +.ipynb_checkpoints/ + +# Large model files (if any) +*.pkl +*.joblib +*.h5 +*.pt +*.pth + +# Documentation +docs/ +PHASE*.md +PROJECT*.md +TOKEN*.md +TOOL*.md +FEATURE*.md +IMPLEMENTATION*.md +MIGRATION*.md +EDA_REPORTS*.md +GITHUB*.md +BIGQUERY*.md diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..3ffe412f7069008a8c3f5cf6ab3284be470fddbd --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +# Google Gemini API Configuration +GOOGLE_API_KEY=your_google_api_key_here + +# Model Configuration +LLM_PROVIDER=gemini +REASONING_EFFORT=medium + +# Cache Configuration +CACHE_DB_PATH=./cache_db/cache.db +CACHE_TTL_SECONDS=86400 + +# Output Configuration +OUTPUT_DIR=./outputs +DATA_DIR=./data + +# Performance Configuration +MAX_PARALLEL_TOOLS=5 +MAX_RETRIES=3 +TIMEOUT_SECONDS=300 diff --git a/.gcloudignore b/.gcloudignore new file mode 100644 index 0000000000000000000000000000000000000000..0a4e0a796dbedc00057e8c2a564a796cdcfb6b24 --- /dev/null +++ b/.gcloudignore @@ -0,0 +1,59 @@ +# This file specifies files that are *not* uploaded to Google Cloud +# using gcloud. 
It follows the same syntax as .gitignore + +.gcloudignore +.git +.gitignore + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +.venv/ +venv/ +ENV/ +env/ + +# Local development +.env +.env.local +*.log + +# Outputs and cache (regenerated in cloud) +outputs/ +cache_db/ +temp/ +test_data/ +data/ + +# Documentation +*.md +!README.md + +# Tests +tests/ +test_*.py +check_*.py + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Jupyter +*.ipynb +.ipynb_checkpoints/ + +# Build artifacts +*.pkl +*.joblib +*.h5 +*.pt +*.pth diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fc805344ff2d616e7d19e3fe37cabae534b0a9d2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,71 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +.venv/ +env/ +ENV/ + +# Environment Variables +.env + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Cache & Outputs +cache_db/*.db +cache_db/*.db-journal +cache_db/ +outputs/ +temp/ +*.pkl +*.joblib + +# Data files (except examples) +data/*.csv +data/*.parquet +!data/.gitkeep + +# Cloud Run URL +.cloud_run_url + +# Jupyter +.ipynb_checkpoints/ +*.ipynb + +# OS +.DS_Store +Thumbs.db + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# Logs +*.log diff --git a/BIGQUERY_SCHEMAS.md b/BIGQUERY_SCHEMAS.md new file mode 100644 index 0000000000000000000000000000000000000000..feb751923f08eb893cde28cd9b2f7ee4fc8112b5 --- /dev/null +++ b/BIGQUERY_SCHEMAS.md @@ -0,0 +1,691 @@ +# BigQuery Output Schemas for Looker Compatibility + +**Purpose**: Define stable BigQuery table schemas that BI tools (Looker, Data Studio) can query reliably. + +**Design Principles**: +- ✅ **Stable Schema**: No breaking changes without versioning +- ✅ **Consistent Naming**: snake_case columns, clear dimension/metric separation +- ✅ **BI-Friendly Types**: Standard SQL types, no complex nested structures +- ✅ **Documented Grain**: Clear primary keys and update patterns +- ✅ **Dashboard-Ready**: Metrics aligned with common visualizations + +--- + +## 📊 Table 1: `model_metrics` + +**Description**: Model performance metrics tracked over time for monitoring and comparison. 
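+For context on how rows land in this table, here is a minimal sketch using the standard `google-cloud-bigquery` client. The table path and column values are placeholders; in practice the writer helpers listed under "Related Tools" at the end of this document would be used instead.
+
+```python
+# Minimal sketch (assumptions: google-cloud-bigquery installed, default credentials,
+# placeholder table path). Column names follow the schema defined below.
+from datetime import datetime, timezone
+
+from google.cloud import bigquery
+
+client = bigquery.Client()
+
+row = {
+    "model_id": "xgboost_churn_20251223_153045",  # primary key (see schema below)
+    "model_type": "XGBoost",
+    "task_type": "classification",
+    "accuracy": 0.95,
+    "f1_score": 0.90,
+    "created_at": datetime.now(timezone.utc).isoformat(),
+    "created_date": datetime.now(timezone.utc).date().isoformat(),
+}
+
+# insert_rows_json streams the row; a non-empty return value lists per-row errors
+errors = client.insert_rows_json("project.dataset.model_metrics", [row])
+if errors:
+    raise RuntimeError(f"model_metrics insert failed: {errors}")
+```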
+ +**Use Cases**: +- Performance dashboards +- Model comparison reports +- Drift detection alerts +- A/B test analysis + +**Update Frequency**: On every model training run + +**Grain**: One row per model training execution + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `project_id` | STRING | Google Cloud project ID | Dimension | `my-ml-project` | +| `dataset_id` | STRING | BigQuery dataset name | Dimension | `ml_models` | +| `model_id` | STRING | Unique model identifier | Dimension (Primary Key) | `xgboost_churn_20251223_153045` | +| `model_name` | STRING | Human-readable model name | Dimension | `Customer Churn Predictor` | +| `model_type` | STRING | Algorithm used | Dimension | `XGBoost`, `RandomForest`, `LightGBM` | +| `task_type` | STRING | ML task category | Dimension | `classification`, `regression` | +| `training_dataset` | STRING | Source table/file reference | Dimension | `project.dataset.train_data` | +| `target_column` | STRING | Prediction target name | Dimension | `churn`, `price`, `survived` | +| `created_at` | TIMESTAMP | Model training timestamp | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `created_date` | DATE | Training date (for partitioning) | Dimension (Time) | `2025-12-23` | +| `feature_count` | INTEGER | Number of features used | Metric | `42` | +| `training_rows` | INTEGER | Training set size | Metric | `10000` | +| `test_rows` | INTEGER | Test set size | Metric | `2500` | +| `training_duration_seconds` | FLOAT | Time to train model | Metric | `123.45` | +| `accuracy` | FLOAT | Overall accuracy (0-1) | Metric | `0.95` | +| `precision` | FLOAT | Precision score (0-1) | Metric | `0.92` | +| `recall` | FLOAT | Recall score (0-1) | Metric | `0.88` | +| `f1_score` | FLOAT | F1 score (0-1) | Metric | `0.90` | +| `roc_auc` | FLOAT | ROC AUC score (0-1) | Metric | `0.94` | +| `pr_auc` | FLOAT | Precision-Recall AUC (0-1) | Metric | `0.91` | +| `mae` | FLOAT | Mean Absolute Error (regression) | Metric | `1234.56` | +| `mse` | FLOAT | Mean Squared Error (regression) | Metric | `567890.12` | +| `rmse` | FLOAT | Root Mean Squared Error (regression) | Metric | `753.59` | +| `r2_score` | FLOAT | R² coefficient (regression) | Metric | `0.85` | +| `cross_val_mean` | FLOAT | Mean CV score | Metric | `0.93` | +| `cross_val_std` | FLOAT | CV score std deviation | Metric | `0.02` | +| `hyperparameters` | STRING (JSON) | Model hyperparameters | Metadata | `{"max_depth": 6, "n_estimators": 100}` | +| `version` | STRING | Model version tag | Dimension | `v1.2.3` | +| `environment` | STRING | Training environment | Dimension | `production`, `staging`, `development` | +| `user_email` | STRING | User who trained model | Dimension | `data-scientist@company.com` | + +### Partitioning & Clustering + +```sql +-- Recommended table setup +CREATE TABLE `project.dataset.model_metrics` +( + -- columns as above +) +PARTITION BY created_date +CLUSTER BY model_type, task_type, environment +OPTIONS( + description="Model performance metrics for BI dashboards", + require_partition_filter=true +); +``` + +### Primary Dimensions for Looker + +- **Time**: `created_at`, `created_date` +- **Model**: `model_type`, `model_name`, `task_type` +- **Performance Tier**: CASE expression on `accuracy`/`f1_score` + - `Excellent` (>0.90) + - `Good` (0.80-0.90) + - `Fair` (0.70-0.80) + - `Poor` (<0.70) + +### Sample Looker View + +```lookml +view: model_metrics { + sql_table_name: `project.dataset.model_metrics` 
;; + + dimension: model_id { + primary_key: yes + type: string + sql: ${TABLE}.model_id ;; + } + + dimension_group: created { + type: time + timeframes: [date, week, month, quarter, year] + sql: ${TABLE}.created_at ;; + } + + dimension: model_type { + type: string + sql: ${TABLE}.model_type ;; + } + + dimension: performance_tier { + type: string + sql: CASE + WHEN ${TABLE}.accuracy >= 0.90 THEN 'Excellent' + WHEN ${TABLE}.accuracy >= 0.80 THEN 'Good' + WHEN ${TABLE}.accuracy >= 0.70 THEN 'Fair' + ELSE 'Poor' + END ;; + } + + measure: count { + type: count + } + + measure: avg_accuracy { + type: average + sql: ${TABLE}.accuracy ;; + value_format_name: percent_2 + } + + measure: avg_f1_score { + type: average + sql: ${TABLE}.f1_score ;; + value_format_name: percent_2 + } +} +``` + +--- + +## 🎯 Table 2: `feature_importance` + +**Description**: Feature importance scores for model interpretability. + +**Use Cases**: +- Feature impact analysis +- Feature selection dashboards +- Model explainability reports + +**Update Frequency**: On every model training run + +**Grain**: One row per feature per model + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `model_id` | STRING | Foreign key to model_metrics | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` | +| `feature_name` | STRING | Name of the feature | Dimension (Primary Key) | `age`, `total_purchases`, `days_since_last_login` | +| `importance_score` | FLOAT | Importance value (0-1) | Metric | `0.35` | +| `importance_rank` | INTEGER | Rank by importance (1=most important) | Metric | `1`, `2`, `3` | +| `importance_type` | STRING | Calculation method | Dimension | `gain`, `weight`, `cover`, `shap` | +| `feature_type` | STRING | Data type category | Dimension | `numeric`, `categorical`, `datetime`, `text` | +| `is_engineered` | BOOLEAN | Created by feature engineering? | Dimension | `true`, `false` | +| `created_at` | TIMESTAMP | When importance was calculated | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `created_date` | DATE | Calculation date | Dimension (Time) | `2025-12-23` | + +### Partitioning & Clustering + +```sql +CREATE TABLE `project.dataset.feature_importance` +( + -- columns as above +) +PARTITION BY created_date +CLUSTER BY model_id, importance_rank +OPTIONS( + description="Feature importance scores for model explainability", + require_partition_filter=false -- Allow cross-model queries +); +``` + +### Primary Dimensions for Looker + +- **Feature**: `feature_name`, `feature_type`, `is_engineered` +- **Model**: `model_id` (join to model_metrics) +- **Importance**: `importance_rank`, `importance_type` + +### Sample Looker View + +```lookml +view: feature_importance { + sql_table_name: `project.dataset.feature_importance` ;; + + dimension: compound_key { + primary_key: yes + hidden: yes + sql: CONCAT(${TABLE}.model_id, '|', ${TABLE}.feature_name) ;; + } + + dimension: feature_name { + type: string + sql: ${TABLE}.feature_name ;; + } + + dimension: is_top_10 { + type: yesno + sql: ${TABLE}.importance_rank <= 10 ;; + } + + measure: avg_importance { + type: average + sql: ${TABLE}.importance_score ;; + value_format_name: percent_2 + } + + measure: count_features { + type: count_distinct + sql: ${TABLE}.feature_name ;; + } +} +``` + +--- + +## 🔮 Table 3: `predictions` + +**Description**: Model predictions with actuals for monitoring and evaluation. 
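+As a quick illustration of how the evaluation columns relate to each other, here is a minimal sketch of filling them in once the actual outcome is recorded. Column names follow the schema below; the helper function itself is illustrative and not part of the repo.
+
+```python
+# Minimal sketch (assumption: a prediction row is a plain dict keyed by the column names below).
+def with_actuals(row: dict, actual_value: float, actual_class: str | None = None) -> dict:
+    """Derive actual_value/actual_class and the error columns for one prediction row."""
+    predicted = row["prediction_value"]
+    row["actual_value"] = actual_value
+    row["actual_class"] = actual_class
+    row["absolute_error"] = abs(predicted - actual_value)
+    row["squared_error"] = (predicted - actual_value) ** 2
+    if actual_class is not None:
+        row["is_correct"] = row.get("prediction_class") == actual_class
+    return row
+
+# Example: churn probability 0.85 scored as "churn"; the customer did churn (actual = 1.0)
+scored = {"prediction_id": "pred_abc123xyz", "prediction_value": 0.85, "prediction_class": "churn"}
+print(with_actuals(scored, actual_value=1.0, actual_class="churn"))
+```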
+ +**Use Cases**: +- Prediction monitoring +- Accuracy tracking over time +- Segment performance analysis +- Business impact measurement + +**Update Frequency**: Real-time or batch (daily/hourly) + +**Grain**: One row per prediction + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `prediction_id` | STRING | Unique prediction identifier | Dimension (Primary Key) | `pred_abc123xyz` | +| `model_id` | STRING | Model used for prediction | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` | +| `entity_id` | STRING | Entity being predicted (customer_id, product_id, etc.) | Dimension | `customer_12345` | +| `predicted_at` | TIMESTAMP | When prediction was made | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `predicted_date` | DATE | Prediction date (for partitioning) | Dimension (Time) | `2025-12-23` | +| `prediction_value` | FLOAT | Predicted value | Metric | `0.85` (probability), `49.99` (price) | +| `prediction_class` | STRING | Predicted class (classification) | Dimension | `churn`, `not_churn` | +| `prediction_confidence` | FLOAT | Model confidence (0-1) | Metric | `0.92` | +| `actual_value` | FLOAT | True value (when available) | Metric | `1.0` (churned), `52.50` (actual price) | +| `actual_class` | STRING | True class (when available) | Dimension | `churn`, `not_churn` | +| `actual_recorded_at` | TIMESTAMP | When actual became known | Dimension (Time) | `2025-12-30 10:00:00 UTC` | +| `is_correct` | BOOLEAN | Prediction was correct? | Dimension | `true`, `false` | +| `absolute_error` | FLOAT | \|predicted - actual\| | Metric | `2.51` | +| `squared_error` | FLOAT | (predicted - actual)² | Metric | `6.30` | +| `feature_values` | STRING (JSON) | Input features used | Metadata | `{"age": 35, "tenure": 24}` | +| `segment` | STRING | Business segment | Dimension | `enterprise`, `smb`, `consumer` | +| `region` | STRING | Geographic region | Dimension | `us-west`, `eu-central` | +| `model_version` | STRING | Model version | Dimension | `v1.2.3` | +| `prediction_latency_ms` | FLOAT | Inference time | Metric | `23.4` | + +### Partitioning & Clustering + +```sql +CREATE TABLE `project.dataset.predictions` +( + -- columns as above +) +PARTITION BY predicted_date +CLUSTER BY model_id, segment, is_correct +OPTIONS( + description="Model predictions with actuals for monitoring", + require_partition_filter=true, + partition_expiration_days=730 -- 2 years retention +); +``` + +### Primary Dimensions for Looker + +- **Time**: `predicted_date`, days since prediction +- **Model**: `model_id`, `model_version` +- **Segment**: `segment`, `region` +- **Accuracy**: `is_correct`, error buckets + +### Sample Looker View + +```lookml +view: predictions { + sql_table_name: `project.dataset.predictions` ;; + + dimension: prediction_id { + primary_key: yes + type: string + sql: ${TABLE}.prediction_id ;; + } + + dimension_group: predicted { + type: time + timeframes: [date, week, month] + sql: ${TABLE}.predicted_at ;; + } + + dimension: segment { + type: string + sql: ${TABLE}.segment ;; + } + + dimension: error_bucket { + type: string + sql: CASE + WHEN ${TABLE}.absolute_error IS NULL THEN 'No Actual Yet' + WHEN ${TABLE}.absolute_error <= 0.1 THEN '0-10%' + WHEN ${TABLE}.absolute_error <= 0.2 THEN '10-20%' + ELSE '>20%' + END ;; + } + + measure: count { + type: count + } + + measure: accuracy_rate { + type: average + sql: CAST(${TABLE}.is_correct AS FLOAT64) ;; + value_format_name: percent_1 + } + + measure: 
avg_confidence { + type: average + sql: ${TABLE}.prediction_confidence ;; + value_format_name: percent_2 + } + + measure: mae { + type: average + sql: ${TABLE}.absolute_error ;; + value_format_name: decimal_2 + } +} +``` + +--- + +## 📋 Table 4: `data_profile_summary` + +**Description**: Dataset profiling statistics for data quality monitoring. + +**Use Cases**: +- Data quality dashboards +- Schema drift detection +- Data validation reports +- Column-level monitoring + +**Update Frequency**: Daily or on-demand + +**Grain**: One row per column per dataset per run + +### Schema + +| Column Name | Type | Description | Dimension/Metric | Example | +|------------|------|-------------|------------------|---------| +| `profile_id` | STRING | Unique profile run identifier | Dimension (Primary Key) | `profile_abc123xyz` | +| `dataset_name` | STRING | Source table/file name | Dimension | `project.dataset.customers` | +| `column_name` | STRING | Column being profiled | Dimension | `age`, `email`, `signup_date` | +| `profiled_at` | TIMESTAMP | When profiling ran | Dimension (Time) | `2025-12-23 15:30:45 UTC` | +| `profiled_date` | DATE | Profiling date | Dimension (Time) | `2025-12-23` | +| `data_type` | STRING | Column data type | Dimension | `INTEGER`, `STRING`, `FLOAT`, `TIMESTAMP` | +| `inferred_type` | STRING | Smart type inference | Dimension | `numeric`, `categorical`, `datetime`, `text`, `email` | +| `row_count` | INTEGER | Total rows in dataset | Metric | `10000` | +| `non_null_count` | INTEGER | Non-null values | Metric | `9850` | +| `null_count` | INTEGER | Null values | Metric | `150` | +| `null_percentage` | FLOAT | % null (0-100) | Metric | `1.5` | +| `unique_count` | INTEGER | Distinct values | Metric | `450` | +| `uniqueness_percentage` | FLOAT | % unique (0-100) | Metric | `4.5` | +| `min_value` | STRING | Minimum value (as string) | Metadata | `18`, `2020-01-01` | +| `max_value` | STRING | Maximum value (as string) | Metadata | `95`, `2025-12-23` | +| `mean_value` | FLOAT | Mean (numeric only) | Metric | `42.5` | +| `median_value` | FLOAT | Median (numeric only) | Metric | `38.0` | +| `std_dev` | FLOAT | Standard deviation (numeric only) | Metric | `15.2` | +| `skewness` | FLOAT | Distribution skewness | Metric | `0.85` | +| `kurtosis` | FLOAT | Distribution kurtosis | Metric | `2.1` | +| `top_value` | STRING | Most common value | Metadata | `male`, `active` | +| `top_value_frequency` | INTEGER | Count of most common value | Metric | `6500` | +| `top_value_percentage` | FLOAT | % of most common value | Metric | `65.0` | +| `has_outliers` | BOOLEAN | Outliers detected? 
| Dimension | `true`, `false` | +| `outlier_count` | INTEGER | Number of outliers | Metric | `23` | +| `outlier_percentage` | FLOAT | % outliers | Metric | `0.23` | +| `quality_score` | FLOAT | Overall quality score (0-100) | Metric | `92.5` | +| `quality_issues` | STRING (JSON) | Detected issues | Metadata | `["high_nulls", "duplicate_values"]` | +| `validation_status` | STRING | Quality check result | Dimension | `pass`, `warn`, `fail` | + +### Partitioning & Clustering + +```sql +CREATE TABLE `project.dataset.data_profile_summary` +( + -- columns as above +) +PARTITION BY profiled_date +CLUSTER BY dataset_name, validation_status +OPTIONS( + description="Dataset profiling for data quality monitoring", + require_partition_filter=true, + partition_expiration_days=90 -- 3 months retention +); +``` + +### Primary Dimensions for Looker + +- **Dataset**: `dataset_name` +- **Column**: `column_name`, `data_type`, `inferred_type` +- **Quality**: `validation_status`, `quality_score` buckets +- **Time**: `profiled_date` + +### Sample Looker View + +```lookml +view: data_profile_summary { + sql_table_name: `project.dataset.data_profile_summary` ;; + + dimension: compound_key { + primary_key: yes + hidden: yes + sql: CONCAT(${TABLE}.profile_id, '|', ${TABLE}.column_name) ;; + } + + dimension: column_name { + type: string + sql: ${TABLE}.column_name ;; + } + + dimension: quality_tier { + type: string + sql: CASE + WHEN ${TABLE}.quality_score >= 90 THEN 'Excellent' + WHEN ${TABLE}.quality_score >= 75 THEN 'Good' + WHEN ${TABLE}.quality_score >= 60 THEN 'Fair' + ELSE 'Poor' + END ;; + } + + dimension: has_quality_issues { + type: yesno + sql: ${TABLE}.validation_status IN ('warn', 'fail') ;; + } + + measure: count_columns { + type: count_distinct + sql: ${TABLE}.column_name ;; + } + + measure: avg_quality_score { + type: average + sql: ${TABLE}.quality_score ;; + value_format_name: decimal_1 + } + + measure: avg_null_percentage { + type: average + sql: ${TABLE}.null_percentage ;; + value_format_name: percent_1 + } + + measure: columns_with_issues { + type: count_distinct + sql: ${TABLE}.column_name ;; + filters: [has_quality_issues: "yes"] + } +} +``` + +--- + +## 🔄 Schema Evolution Guidelines + +### ✅ **SAFE Changes** (Non-Breaking) + +1. **Add new columns** (always nullable or with defaults) + ```sql + ALTER TABLE `project.dataset.model_metrics` + ADD COLUMN IF NOT EXISTS new_metric FLOAT64; + ``` + +2. **Add new tables** (doesn't affect existing dashboards) + +3. **Lengthen STRING columns** (VARCHAR(50) → VARCHAR(100)) + +4. **Add indexes/clustering** (performance only) + +5. **Add column descriptions** + ```sql + ALTER TABLE `project.dataset.model_metrics` + ALTER COLUMN accuracy SET OPTIONS (description='Model accuracy (0-1)'); + ``` + +### ❌ **BREAKING Changes** (Require Dashboard Updates) + +1. **Rename columns** → Use views for backward compatibility: + ```sql + CREATE OR REPLACE VIEW `project.dataset.model_metrics_v2` AS + SELECT + model_id, + accuracy AS acc, -- renamed column + ... + FROM `project.dataset.model_metrics`; + ``` + +2. **Change data types** → Create new column, migrate, deprecate old: + ```sql + -- Step 1: Add new column + ALTER TABLE model_metrics ADD COLUMN created_at_new TIMESTAMP; + + -- Step 2: Backfill + UPDATE model_metrics SET created_at_new = CAST(created_at AS TIMESTAMP) WHERE true; + + -- Step 3: Update dashboards to use new column + + -- Step 4: Drop old column after validation period + ALTER TABLE model_metrics DROP COLUMN created_at; + ``` + +3. 
**Remove columns** → Deprecate first, remove after 90 days + +4. **Change partitioning** → Requires table recreation + +### 🔄 **Versioning Strategy** + +For major schema changes, create versioned tables: + +``` +project.dataset.model_metrics_v1 (deprecated, keep 90 days) +project.dataset.model_metrics_v2 (current) +project.dataset.model_metrics (view pointing to latest version) +``` + +--- + +## 📊 Dashboard-Ready Metrics Catalog + +### Model Performance Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Model Count** | `COUNT(DISTINCT model_id)` | Total models trained | +| **Avg Accuracy** | `AVG(accuracy)` | Overall model quality | +| **Accuracy Trend** | `AVG(accuracy) OVER (ORDER BY created_date)` | Performance over time | +| **Best Model** | `model_id WHERE accuracy = MAX(accuracy)` | Top performer | +| **Models by Type** | `COUNT(*) GROUP BY model_type` | Algorithm distribution | +| **Training Time** | `AVG(training_duration_seconds)` | Resource usage | +| **Recent Models** | `WHERE created_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)` | Latest activity | + +### Feature Importance Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Top Features** | `WHERE importance_rank <= 10` | Most impactful features | +| **Avg Importance** | `AVG(importance_score)` | Feature impact distribution | +| **Engineered Features** | `COUNT(*) WHERE is_engineered = true` | Feature engineering effectiveness | +| **Feature Stability** | `STDDEV(importance_score) GROUP BY feature_name` | Consistent predictors | + +### Prediction Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Accuracy Rate** | `AVG(CAST(is_correct AS FLOAT64))` | Real-world performance | +| **MAE** | `AVG(absolute_error)` | Average error magnitude | +| **RMSE** | `SQRT(AVG(squared_error))` | Error with outlier penalty | +| **Predictions/Day** | `COUNT(*) GROUP BY predicted_date` | Volume tracking | +| **Confidence Distribution** | `APPROX_QUANTILES(prediction_confidence, 10)` | Model calibration | +| **Segment Performance** | `AVG(is_correct) GROUP BY segment` | Fairness check | + +### Data Quality Metrics + +| Metric Name | Calculation | Use Case | +|------------|-------------|----------| +| **Data Quality Score** | `AVG(quality_score)` | Overall health | +| **Null Rate** | `AVG(null_percentage)` | Completeness | +| **Columns with Issues** | `COUNT(DISTINCT column_name) WHERE validation_status != 'pass'` | Problem areas | +| **Quality Trend** | `AVG(quality_score) OVER (ORDER BY profiled_date)` | Improving/degrading? 
| + +--- + +## 🎯 Sample Looker Explores + +### Explore 1: Model Performance Analysis + +```lookml +explore: model_metrics { + label: "Model Performance" + description: "Track model accuracy, training time, and comparison" + + join: feature_importance { + type: left_outer + sql_on: ${model_metrics.model_id} = ${feature_importance.model_id} ;; + relationship: one_to_many + } +} +``` + +### Explore 2: Prediction Monitoring + +```lookml +explore: predictions { + label: "Prediction Monitoring" + description: "Real-time prediction accuracy and drift" + + join: model_metrics { + type: left_outer + sql_on: ${predictions.model_id} = ${model_metrics.model_id} ;; + relationship: many_to_one + } +} +``` + +### Explore 3: Data Quality Dashboard + +```lookml +explore: data_profile_summary { + label: "Data Quality" + description: "Monitor data health and schema drift" +} +``` + +--- + +## 📝 Implementation Checklist + +### Phase 1: Setup (Week 1) +- [ ] Create all 4 BigQuery tables with partitioning +- [ ] Set up service account permissions +- [ ] Configure table expiration policies +- [ ] Document table owners and update SLAs + +### Phase 2: Integration (Week 2) +- [ ] Update tools to write to these schemas +- [ ] Add schema validation in CI/CD +- [ ] Create data dictionary in Looker +- [ ] Set up table monitoring alerts + +### Phase 3: BI Layer (Week 3) +- [ ] Create Looker views for all 4 tables +- [ ] Build explores with joins +- [ ] Create initial dashboards +- [ ] Set up scheduled data refreshes + +### Phase 4: Validation (Week 4) +- [ ] Backfill historical data +- [ ] Verify dashboard accuracy +- [ ] Train stakeholders on dashboards +- [ ] Document runbooks for common issues + +--- + +## 🔗 Related Tools + +**BigQuery Write Tools** (src/bigquery/): +- `bigquery_write_results()` - Generic write function +- Helper: `bigquery_write_model_metrics()` - Specialized writer +- Helper: `bigquery_write_feature_importance()` - Specialized writer +- Helper: `bigquery_write_predictions()` - Specialized writer +- Helper: `bigquery_write_data_profile()` - Specialized writer + +**Example Usage**: +```python +from src.bigquery import bigquery_write_results + +# Write model metrics +bigquery_write_results( + data=metrics_df, + table_id="project.dataset.model_metrics", + write_disposition="WRITE_APPEND" +) +``` + +--- + +## 📚 Additional Resources + +- [BigQuery Best Practices](https://cloud.google.com/bigquery/docs/best-practices) +- [Looker LookML Reference](https://cloud.google.com/looker/docs/reference/lookml-quick-reference) +- [Schema Design for BI](https://cloud.google.com/architecture/bigquery-data-warehouse) + +--- + +**Last Updated**: December 23, 2025 +**Schema Version**: 1.0.0 +**Maintained By**: Data Science Team +**Review Cadence**: Quarterly diff --git a/CHECKLIST.md b/CHECKLIST.md new file mode 100644 index 0000000000000000000000000000000000000000..2d25ef7533123646a384d6b5740c015dc0839ba7 --- /dev/null +++ b/CHECKLIST.md @@ -0,0 +1,97 @@ +# ✅ Pre-Launch Checklist + +## Before Running the Application + +### 1. Environment Variables ⚠️ **REQUIRED** + +You MUST set your API key before starting: + +```powershell +# Windows PowerShell +$env:GOOGLE_API_KEY="your-google-api-key-here" + +# Verify it's set +echo $env:GOOGLE_API_KEY +``` + +### 2. Build Status ✅ + +- [x] Frontend dependencies installed +- [x] Frontend built (FRRONTEEEND/dist exists) +- [x] Backend code updated with new endpoints +- [x] Configuration files in place + +### 3. 
Quick Start Commands + +**Option A - Use the start script:** +```powershell +.\start.ps1 +``` + +**Option B - Manual start:** +```powershell +# Make sure you're in the project root +Set-Location "c:\Users\Pulastya\Videos\DS AGENTTTT" + +# Set API key (if not already set) +$env:GOOGLE_API_KEY="your-key-here" + +# Start the server +python src\api\app.py +``` + +### 4. Access the Application + +Once the server starts, open your browser to: +**http://localhost:8080** + +You should see: +1. **Landing Page** - Professional homepage with agent features +2. **Launch Console** button - Click to open the chat interface +3. **Chat Interface** - Modern conversational UI + +### 5. Test the Chat + +Try these sample prompts: +- "What can you do?" +- "Explain your data science capabilities" +- "How do I upload a dataset?" +- "What ML models do you support?" + +### 6. Expected Console Output + +When you start the server, you should see: +``` +INFO: Started server process [####] +INFO: Waiting for application startup. +✅ Agent initialized with provider: groq +✅ Frontend assets mounted from C:\Users\Pulastya\Videos\DS AGENTTTT\FRRONTEEEND\dist +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:8080 +``` + +### 7. Troubleshooting Quick Reference + +| Issue | Solution | +|-------|----------| +| "Agent not initialized" | Set GOOGLE_API_KEY environment variable | +| "Frontend not found" | Run `cd FRRONTEEEND && npm run build` | +| Port 8080 in use | Kill the process or change PORT env var | +| Import errors | Run `pip install -r requirements.txt` | + +## Next Steps After Launch + +1. **Test the chat** with the agent +2. **Upload a dataset** (feature coming soon in chat) +3. **Try the API endpoints** at http://localhost:8080/docs +4. **Customize the frontend** in FRRONTEEEND/components/ + +## Documentation + +- 📖 [MIGRATION_COMPLETE.md](MIGRATION_COMPLETE.md) - What was changed +- 📖 [FRONTEND_INTEGRATION.md](FRONTEND_INTEGRATION.md) - Technical details +- 📖 [README.md](README.md) - Main project docs + +--- + +**Ready to launch?** Run `.\start.ps1` and visit http://localhost:8080 🚀 diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000000000000000000000000000000000000..730a645bc23f32c2e8f96839628cd63a74899218 --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,495 @@ +# 🚀 Google Cloud Run Deployment Guide + +Complete guide to deploy the Data Science Agent to Google Cloud Run as a serverless API. + +## 📋 Prerequisites + +1. **Google Cloud Platform Account** + - Active GCP account with billing enabled + - Project created (or use existing project) + +2. **Install Google Cloud SDK** + ```bash + # macOS (Homebrew) + brew install --cask google-cloud-sdk + + # Or download from: https://cloud.google.com/sdk/install + ``` + +3. **Authenticate with GCP** + ```bash + gcloud auth login + gcloud auth application-default login + ``` + +4. 
**Set Your Project** + ```bash + gcloud config set project YOUR_PROJECT_ID + ``` + +--- + +## 🎯 Deployment Options + +### Option 1: Automated Deployment (Recommended) + +Use the provided deployment script for one-command deployment: + +```bash +# Set required environment variables +export GCP_PROJECT_ID="your-project-id" +export GROQ_API_KEY="your-groq-api-key" +export GOOGLE_API_KEY="your-google-api-key" # Optional for Gemini + +# Run deployment script +./deploy.sh +``` + +**What it does:** +- ✅ Enables required GCP APIs (Cloud Build, Cloud Run, Secret Manager) +- ✅ Creates secrets for API keys +- ✅ Builds Docker container +- ✅ Deploys to Cloud Run +- ✅ Returns service URL + +**Configuration options:** +```bash +# Optional: Customize deployment +export CLOUD_RUN_REGION="us-central1" # Change region +export MEMORY="4Gi" # Increase memory +export CPU="2" # Set CPU count +export MAX_INSTANCES="10" # Scale limit +export TIMEOUT="900" # Request timeout (15 min) + +./deploy.sh +``` + +--- + +### Option 2: Manual Deployment + +Step-by-step manual deployment for full control: + +#### Step 1: Enable APIs +```bash +gcloud services enable \ + cloudbuild.googleapis.com \ + run.googleapis.com \ + containerregistry.googleapis.com \ + secretmanager.googleapis.com +``` + +#### Step 2: Create Secrets +```bash +# Create GROQ API key secret +echo -n "your-groq-api-key" | gcloud secrets create GROQ_API_KEY --data-file=- + +# Create Google API key secret (optional) +echo -n "your-google-api-key" | gcloud secrets create GOOGLE_API_KEY --data-file=- + +# Grant Cloud Run access to secrets +PROJECT_NUMBER=$(gcloud projects describe $(gcloud config get-value project) --format="value(projectNumber)") +gcloud secrets add-iam-policy-binding GROQ_API_KEY \ + --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \ + --role="roles/secretmanager.secretAccessor" +``` + +#### Step 3: Build Container +```bash +gcloud builds submit --tag gcr.io/$(gcloud config get-value project)/data-science-agent +``` + +#### Step 4: Deploy to Cloud Run +```bash +gcloud run deploy data-science-agent \ + --image gcr.io/$(gcloud config get-value project)/data-science-agent \ + --platform managed \ + --region us-central1 \ + --allow-unauthenticated \ + --memory 4Gi \ + --cpu 2 \ + --timeout 900 \ + --max-instances 10 \ + --set-env-vars LLM_PROVIDER=groq,REASONING_EFFORT=medium \ + --set-secrets GROQ_API_KEY=GROQ_API_KEY:latest,GOOGLE_API_KEY=GOOGLE_API_KEY:latest +``` + +--- + +### Option 3: CI/CD with Cloud Build Triggers + +Automated deployment on git push: + +#### Step 1: Connect Repository +```bash +# Connect GitHub/GitLab/Bitbucket repository +gcloud beta builds connections create github connection-name \ + --region=us-central1 +``` + +#### Step 2: Create Build Trigger +```bash +gcloud builds triggers create github \ + --name="deploy-data-science-agent" \ + --repo-name="Data-Science-Agent" \ + --repo-owner="Surfing-Ninja" \ + --branch-pattern="^main$" \ + --build-config="cloudbuild.yaml" +``` + +Now every push to `main` branch automatically deploys! 🎉 + +--- + +## 🧪 Testing the Deployment + +### 1. Health Check +```bash +SERVICE_URL=$(gcloud run services describe data-science-agent \ + --region us-central1 \ + --format 'value(status.url)') + +curl $SERVICE_URL/health +``` + +**Expected response:** +```json +{ + "status": "healthy", + "agent_ready": true, + "provider": "groq", + "tools_count": 82 +} +``` + +### 2. List Available Tools +```bash +curl $SERVICE_URL/tools | jq +``` + +### 3. 
Profile a Dataset +```bash +curl -X POST $SERVICE_URL/profile \ + -F "file=@test_data/sample.csv" +``` + +### 4. Run Full Analysis +```bash +curl -X POST $SERVICE_URL/run \ + -F "file=@test_data/sample.csv" \ + -F "task_description=Analyze this dataset, detect outliers, and train a prediction model" \ + -F "target_col=target" \ + | jq +``` + +--- + +## 📊 Monitoring & Logs + +### View Real-time Logs +```bash +gcloud run logs tail data-science-agent --region us-central1 +``` + +### View Recent Logs +```bash +gcloud run logs read data-science-agent \ + --region us-central1 \ + --limit 50 +``` + +### Cloud Console Monitoring +- Go to: https://console.cloud.google.com/run +- Click on `data-science-agent` +- View: Metrics, Logs, Revisions + +--- + +## 💰 Cost Estimation + +### Cloud Run Pricing (as of Dec 2024) +**Free Tier** (per month): +- 2 million requests +- 360,000 GB-seconds of memory +- 180,000 vCPU-seconds + +**Paid Tier** (us-central1): +- CPU: $0.00002400 per vCPU-second +- Memory: $0.00000250 per GB-second +- Requests: $0.40 per million requests + +**Example Cost for 4Gi Memory, 2 vCPU:** +- 1 request taking 60 seconds + - CPU: 2 vCPU × 60s × $0.000024 = $0.00288 + - Memory: 4GB × 60s × $0.0000025 = $0.0006 + - Request: $0.0000004 + - **Total: ~$0.0035 per request** + +**Monthly estimate for 1000 requests/month:** +- ~$3.50/month (well within free tier for testing!) + +--- + +## 🔒 Security Best Practices + +### 1. Enable Authentication (Production) +```bash +# Deploy with authentication required +gcloud run deploy data-science-agent \ + --no-allow-unauthenticated \ + --region us-central1 \ + --image gcr.io/PROJECT_ID/data-science-agent + +# Create service account for clients +gcloud iam service-accounts create api-client + +# Grant invoker role +gcloud run services add-iam-policy-binding data-science-agent \ + --member="serviceAccount:api-client@PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/run.invoker" \ + --region us-central1 +``` + +### 2. Use VPC Connector (For BigQuery/GCS) +```bash +# Create VPC connector +gcloud compute networks vpc-access connectors create ds-agent-connector \ + --network default \ + --region us-central1 \ + --range 10.8.0.0/28 + +# Deploy with VPC +gcloud run deploy data-science-agent \ + --vpc-connector ds-agent-connector \ + --region us-central1 +``` + +### 3. 
Restrict API Keys +- Set **Application restrictions** in Google Cloud Console +- Whitelist only Cloud Run service URL +- Set **API restrictions** to only required APIs + +--- + +## 🔧 Configuration Options + +### Environment Variables +```bash +# Set during deployment +--set-env-vars KEY1=value1,KEY2=value2 + +# Available variables: +LLM_PROVIDER=groq # or "gemini" +REASONING_EFFORT=medium # low, medium, high +CACHE_TTL_SECONDS=86400 # Cache lifetime +ARTIFACT_BACKEND=local # or "gcs" for cloud storage +GCS_BUCKET_NAME=your-bucket # If using GCS backend +OUTPUT_DIR=/tmp/outputs # Output directory +MAX_PARALLEL_TOOLS=5 # Concurrent tool execution +MAX_RETRIES=3 # Tool retry attempts +TIMEOUT_SECONDS=300 # Tool timeout +``` + +### Resource Limits +```bash +--memory 4Gi # 128Mi to 32Gi +--cpu 2 # 1 to 8 vCPU +--timeout 900 # Max 3600s (1 hour) +--max-instances 10 # Scale limit +--min-instances 0 # Always-warm instances +--concurrency 10 # Requests per instance +``` + +--- + +## 🐛 Troubleshooting + +### Build Fails +```bash +# Check build logs +gcloud builds list --limit=5 +gcloud builds log BUILD_ID + +# Common fixes: +# - Ensure Dockerfile is in root directory +# - Check requirements.txt has all dependencies +# - Increase build timeout: --timeout=1200s +``` + +### Deployment Fails +```bash +# Check service status +gcloud run services describe data-science-agent --region us-central1 + +# Common fixes: +# - Ensure APIs are enabled +# - Check secrets exist and are accessible +# - Verify service account permissions +``` + +### Runtime Errors +```bash +# View logs +gcloud run logs tail data-science-agent --region us-central1 + +# Common issues: +# - API keys not set: Check secrets +# - Import errors: Ensure all dependencies in requirements.txt +# - Memory issues: Increase --memory limit +# - Timeout: Increase --timeout value +``` + +### Container Crashes +```bash +# Test locally first +docker build -t ds-agent . +docker run -p 8080:8080 \ + -e GROQ_API_KEY="your-key" \ + ds-agent + +curl http://localhost:8080/health +``` + +--- + +## 🚀 Advanced Features + +### Custom Domain +```bash +# Map custom domain +gcloud run domain-mappings create \ + --service data-science-agent \ + --domain api.yourdomain.com \ + --region us-central1 +``` + +### Load Balancing +```bash +# Create multiple regional deployments +for region in us-central1 us-east1 europe-west1; do + gcloud run deploy data-science-agent \ + --image gcr.io/PROJECT_ID/data-science-agent \ + --region $region +done + +# Set up global load balancer +# Follow: https://cloud.google.com/load-balancing/docs/https/setup-global-ext-https-serverless +``` + +### Multi-Region Deployment +```bash +# Deploy to multiple regions for high availability +./deploy.sh CLOUD_RUN_REGION=us-central1 +./deploy.sh CLOUD_RUN_REGION=europe-west1 +./deploy.sh CLOUD_RUN_REGION=asia-east1 +``` + +--- + +## 📝 API Documentation + +Once deployed, access Swagger docs at: +``` +https://YOUR_SERVICE_URL/docs +``` + +### Available Endpoints + +#### `GET /` - Health Check +Returns service status and tool count. + +#### `GET /health` - Detailed Health +Returns agent readiness and provider info. + +#### `GET /tools` - List Tools +Returns all 82 available tools organized by category. + +#### `POST /run` - Run Full Analysis +Upload dataset and execute complete data science workflow. 
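+For programmatic access (instead of the `curl` examples above), here is a minimal Python sketch using `requests`. The service URL is a placeholder and the form fields mirror the parameters listed below.
+
+```python
+# Minimal sketch (assumptions: requests installed, SERVICE_URL points at the deployed service).
+import requests
+
+SERVICE_URL = "https://YOUR_SERVICE_URL"
+
+with open("test_data/sample.csv", "rb") as f:
+    resp = requests.post(
+        f"{SERVICE_URL}/run",
+        files={"file": ("sample.csv", f, "text/csv")},
+        data={
+            "task_description": "Profile this dataset and train a churn model",
+            "target_col": "churn",
+            "use_cache": "true",
+            "max_iterations": "20",
+        },
+        timeout=900,  # long-running workflow; matches the Cloud Run request timeout
+    )
+resp.raise_for_status()
+print(resp.json())
+```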
+ +**Parameters:** +- `file`: CSV/Parquet file (multipart/form-data) +- `task_description`: Natural language task description +- `target_col`: Target column for ML (optional) +- `use_cache`: Enable caching (default: true) +- `max_iterations`: Max workflow steps (default: 20) + +#### `POST /profile` - Quick Profile +Quick dataset profiling without full workflow. + +**Parameters:** +- `file`: CSV/Parquet file (multipart/form-data) + +--- + +## 🔄 Updates & Rollbacks + +### Update Deployment +```bash +# Rebuild and redeploy +./deploy.sh +``` + +### Rollback to Previous Revision +```bash +# List revisions +gcloud run revisions list --service data-science-agent --region us-central1 + +# Rollback +gcloud run services update-traffic data-science-agent \ + --to-revisions REVISION_NAME=100 \ + --region us-central1 +``` + +### Blue/Green Deployment +```bash +# Deploy new version with tag +gcloud run deploy data-science-agent \ + --tag blue \ + --no-traffic \ + --region us-central1 + +# Test: https://blue---data-science-agent-HASH.run.app + +# Switch traffic +gcloud run services update-traffic data-science-agent \ + --to-tags blue=100 \ + --region us-central1 +``` + +--- + +## 📚 Additional Resources + +- **Cloud Run Docs**: https://cloud.google.com/run/docs +- **Pricing Calculator**: https://cloud.google.com/products/calculator +- **Best Practices**: https://cloud.google.com/run/docs/tips +- **Quotas & Limits**: https://cloud.google.com/run/quotas + +--- + +## ✅ Deployment Checklist + +- [ ] GCP project created and billing enabled +- [ ] Google Cloud SDK installed and authenticated +- [ ] API keys obtained (GROQ_API_KEY, GOOGLE_API_KEY) +- [ ] Secrets created in Secret Manager +- [ ] Docker container builds successfully locally +- [ ] Cloud Run APIs enabled +- [ ] Service deployed to Cloud Run +- [ ] Health check endpoint returns 200 +- [ ] Test dataset profiled successfully +- [ ] Full analysis workflow tested +- [ ] Monitoring/logging configured +- [ ] Cost alerts set up (optional) +- [ ] Custom domain mapped (optional) +- [ ] CI/CD pipeline configured (optional) + +--- + +**Need help?** Check the troubleshooting section or view logs with: +```bash +gcloud run logs tail data-science-agent --region us-central1 +``` + +Happy deploying! 🎉 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..3efba6cebd27487a0fb3dd7c653390a633e6ecc0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,78 @@ +# Multi-stage build for Google Cloud Run +# Stage 1: Build Frontend +FROM node:20-alpine as frontend-builder + +WORKDIR /frontend + +# Copy frontend files +COPY FRRONTEEEND/package*.json ./ +RUN npm install + +COPY FRRONTEEEND/ ./ +RUN npm run build + +# Stage 2: Build Python environment +FROM python:3.13-slim as builder + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + make \ + && rm -rf /var/lib/apt/lists/* + +# Create virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Copy requirements and install Python packages +COPY requirements.txt . 
+RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r requirements.txt + +# Stage 3: Runtime environment +FROM python:3.13-slim + +# Install runtime dependencies only +RUN apt-get update && apt-get install -y \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# Copy virtual environment from builder +COPY --from=builder /opt/venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Set working directory +WORKDIR /app + +# Copy application code +COPY src/ /app/src/ +COPY examples/ /app/examples/ + +# Copy built frontend from frontend-builder +COPY --from=frontend-builder /frontend/dist /app/FRRONTEEEND/dist + +# Create necessary directories for Cloud Run ephemeral storage +RUN mkdir -p /tmp/data_science_agent \ + /tmp/outputs/models \ + /tmp/outputs/plots \ + /tmp/outputs/reports \ + /tmp/outputs/data \ + /tmp/cache_db + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PORT=8080 +ENV OUTPUT_DIR=/tmp/outputs +ENV CACHE_DB_PATH=/tmp/cache_db/cache.db +ENV ARTIFACT_BACKEND=local + +# Cloud Run expects the service to listen on the PORT env variable +EXPOSE 8080 + +# Health check (optional, Cloud Run handles this) +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import requests; requests.get('http://localhost:8080/health')" || exit 1 + +# Run the FastAPI application +CMD ["python", "src/api/app.py"] diff --git a/FRONTEND_INTEGRATION.md b/FRONTEND_INTEGRATION.md new file mode 100644 index 0000000000000000000000000000000000000000..15c0063b94d5b6eefe01556751f79eef56784704 --- /dev/null +++ b/FRONTEND_INTEGRATION.md @@ -0,0 +1,234 @@ +# Data Science Agent - Frontend Integration Guide + +## 🎉 New React Frontend + +The application now features a modern, professional React frontend that replaces the old Gradio interface. + +### Features + +- **Beautiful Landing Page**: Showcases the agent's capabilities with modern design +- **Professional Chat Interface**: NextChat-style conversational UI +- **Direct Backend Integration**: Communicates with your FastAPI backend +- **Responsive Design**: Works on all devices +- **Dark Theme**: Modern, eye-friendly interface + +## 🚀 Quick Start + +### Prerequisites + +- Python 3.13+ +- Node.js 20+ +- npm (comes with Node.js) + +### Running the Application + +#### Option 1: Using the Build Script (Recommended) + +**Windows:** +```powershell +.\build-and-deploy.ps1 +``` + +**Linux/Mac:** +```bash +chmod +x build-and-deploy.sh +./build-and-deploy.sh +``` + +Then start the server: +```bash +python src/api/app.py +``` + +#### Option 2: Manual Steps + +1. **Build the Frontend:** +```bash +cd FRRONTEEEND +npm.cmd install +npm.cmd run build +cd .. +``` + +2. **Install Python Dependencies:** +```bash +pip install -r requirements.txt +``` + +3. **Start the Backend Server:** +```bash +python src/api/app.py +``` + +4. **Access the Application:** +Open your browser and navigate to: http://localhost:8080 + +## 🏗️ Architecture + +### Backend (FastAPI) +- **Location**: `src/api/app.py` +- **Port**: 8080 +- **Endpoints**: + - `GET /` - Health check & landing page + - `POST /chat` - Chat interface endpoint + - `POST /run` - Full data science workflow + - `POST /profile` - Dataset profiling + - `GET /tools` - List available tools + +### Frontend (React + Vite) +- **Location**: `FRRONTEEEND/` +- **Build Output**: `FRRONTEEEND/dist/` +- **Dev Port**: 3000 (development mode) +- **Production**: Served by FastAPI at port 8080 + +## 🔧 Development Mode + +If you want to develop the frontend with hot-reloading: + +1. 
**Terminal 1 - Backend:** +```bash +python src/api/app.py +``` + +2. **Terminal 2 - Frontend:** +```bash +cd FRRONTEEEND +npm.cmd run dev +``` + +Access: +- Frontend (dev): http://localhost:3000 +- Backend API: http://localhost:8080 + +## 🌐 API Integration + +The frontend now communicates with your FastAPI backend instead of calling external APIs directly. + +### Environment Variables + +Create `FRRONTEEEND/.env` for local development: +```env +VITE_API_URL=http://localhost:8080 +``` + +For production, update `FRRONTEEEND/.env.production`: +```env +VITE_API_URL=https://your-cloud-run-url.run.app +``` + +## 📦 Deployment + +### Docker Build + +The Dockerfile now includes a multi-stage build that: +1. Builds the React frontend +2. Builds the Python environment +3. Combines both in the final image + +```bash +docker build -t data-science-agent . +docker run -p 8080:8080 data-science-agent +``` + +### Google Cloud Run + +```bash +gcloud builds submit --tag gcr.io/YOUR-PROJECT-ID/data-science-agent +gcloud run deploy data-science-agent \ + --image gcr.io/YOUR-PROJECT-ID/data-science-agent \ + --platform managed \ + --region us-central1 \ + --allow-unauthenticated \ + --set-env-vars GROQ_API_KEY=your-api-key +``` + +## 🔄 What Changed + +### Removed +- ❌ Gradio interface (`chat_ui.py` - kept for reference) +- ❌ Direct Google GenAI calls from frontend +- ❌ Gradio dependency + +### Added +- ✅ React + TypeScript frontend with Vite +- ✅ Professional landing page +- ✅ Modern chat interface +- ✅ `/chat` API endpoint +- ✅ CORS support in FastAPI +- ✅ Static file serving for React app +- ✅ Multi-stage Docker build + +## 🛠️ Tech Stack + +### Frontend +- React 19 +- TypeScript 5.8 +- Vite 6 +- Tailwind CSS +- Framer Motion (animations) +- Lucide React (icons) + +### Backend (unchanged) +- FastAPI +- Python 3.13 +- Groq API +- Polars, DuckDB +- Scikit-learn, XGBoost, LightGBM + +## 📁 Project Structure + +``` +. +├── FRRONTEEEND/ # React frontend +│ ├── components/ # React components +│ ├── dist/ # Built frontend (after npm run build) +│ ├── package.json +│ ├── vite.config.ts +│ └── .env # Frontend environment variables +├── src/ +│ ├── api/ +│ │ └── app.py # FastAPI backend (updated) +│ ├── tools/ # Data science tools +│ └── orchestrator.py # Main agent logic +├── requirements.txt # Python dependencies (updated) +├── Dockerfile # Multi-stage build (updated) +├── build-and-deploy.ps1 # Windows build script +└── build-and-deploy.sh # Linux/Mac build script +``` + +## 🐛 Troubleshooting + +### Frontend doesn't load +- Make sure you've run `npm run build` in the FRRONTEEEND directory +- Check that `FRRONTEEEND/dist/` exists and contains files + +### API errors in chat +- Ensure the backend is running on port 8080 +- Check that `GROQ_API_KEY` is set in your environment +- Verify the API URL in `.env` file + +### CORS errors +- The backend now has CORS enabled for development +- For production, update the `allow_origins` in `src/api/app.py` + +## 📝 Notes + +- The old `chat_ui.py` has been kept for reference but is no longer used +- All chat functionality now goes through the `/chat` endpoint +- The frontend is automatically served by FastAPI in production mode +- Session history is maintained in the frontend (browser) + +## 🎯 Next Steps + +1. **Customize the frontend**: Edit files in `FRRONTEEEND/components/` +2. **Add file upload**: Extend `ChatInterface.tsx` to handle file uploads +3. **Add visualization**: Display charts from the backend in the chat +4. 
**Authentication**: Add user authentication if needed + +## 📞 Support + +For issues or questions: +1. Check the console logs (browser & terminal) +2. Verify environment variables +3. Ensure all dependencies are installed +4. Review the API documentation at http://localhost:8080/docs diff --git a/FRRONTEEEND/.env.production b/FRRONTEEEND/.env.production new file mode 100644 index 0000000000000000000000000000000000000000..2cc3ea7198d28d128dee3896ee58004eeb3f053e --- /dev/null +++ b/FRRONTEEEND/.env.production @@ -0,0 +1,3 @@ +# Production API Configuration +# Update this to your production API URL +VITE_API_URL=https://your-cloud-run-url.run.app diff --git a/FRRONTEEEND/.gitignore b/FRRONTEEEND/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a547bf36d8d11a4f89c59c144f24795749086dd1 --- /dev/null +++ b/FRRONTEEEND/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/FRRONTEEEND/App.tsx b/FRRONTEEEND/App.tsx new file mode 100644 index 0000000000000000000000000000000000000000..52ef8907179ae2d06c533aca1dee7cc1a6c70508 --- /dev/null +++ b/FRRONTEEEND/App.tsx @@ -0,0 +1,59 @@ + +import React, { useState } from 'react'; +import { HeroGeometric } from './components/HeroGeometric'; +import ProblemSolution from './components/ProblemSolution'; +import KeyCapabilities from './components/KeyCapabilities'; +import Process from './components/Process'; +import TechStack from './components/TechStack'; +import Footer from './components/Footer'; +import { BackgroundPaths } from './components/BackgroundPaths'; +import { Logo } from './components/Logo'; +import { ChatInterface } from './components/ChatInterface'; + +const App: React.FC = () => { + const [view, setView] = useState<'landing' | 'chat'>('landing'); + + if (view === 'chat') { + return setView('landing')} />; + } + + return ( +
+ {/* Navigation (Overlay) */} + + +
+ setView('chat')} /> + + + + + {/* Transitional background paths section */} + + + +
+ +
+
+ ); +}; + +export default App; diff --git a/FRRONTEEEND/README.md b/FRRONTEEEND/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c6da4bbec3fc4886ec282b611d2c717fe6f602e --- /dev/null +++ b/FRRONTEEEND/README.md @@ -0,0 +1,20 @@ +
+GHBanner +
+ +# Run and deploy your AI Studio app + +This contains everything you need to run your app locally. + +View your app in AI Studio: https://ai.studio/apps/drive/1gChoktTuh429q26FzxS4BPo0q0LnlRE9 + +## Run Locally + +**Prerequisites:** Node.js + + +1. Install dependencies: + `npm install` +2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key +3. Run the app: + `npm run dev` diff --git a/FRRONTEEEND/components/BackgroundPaths.tsx b/FRRONTEEEND/components/BackgroundPaths.tsx new file mode 100644 index 0000000000000000000000000000000000000000..fcc5d3a741b76fac9313bac49cf130dd5f9abcb5 --- /dev/null +++ b/FRRONTEEEND/components/BackgroundPaths.tsx @@ -0,0 +1,148 @@ + +import React from "react"; +import { motion } from "framer-motion"; +import { ArrowRight } from "lucide-react"; +import { cn } from "../lib/utils"; + +function FloatingPaths({ position }: { position: number }) { + const paths = Array.from({ length: 36 }, (_, i) => ({ + id: i, + d: `M-${380 - i * 5 * position} -${189 + i * 6}C-${ + 380 - i * 5 * position + } -${189 + i * 6} -${312 - i * 5 * position} ${216 - i * 6} ${ + 152 - i * 5 * position + } ${343 - i * 6}C${616 - i * 5 * position} ${470 - i * 6} ${ + 684 - i * 5 * position + } ${875 - i * 6} ${684 - i * 5 * position} ${875 - i * 6}`, + color: `rgba(99,102,241,${0.05 + i * 0.01})`, // Using indigo-500 tint + width: 0.5 + i * 0.03, + })); + + return ( +
+ + Background Paths + {paths.map((path) => ( + + ))} + +
+ ); +} + +export function BackgroundPaths({ + title = "The Future is Autonomous", + subtitle = "Scale your data engineering and predictive modeling beyond human limits.", +}: { + title?: string; + subtitle?: string; +}) { + const words = title.split(" "); + + return ( +
+
+ + +
+ +
+ +

+ {words.map((word, wordIndex) => ( + + {word.split("").map((letter, letterIndex) => ( + + {letter} + + ))} + + ))} +

+ + + {subtitle} + + + + + +
+
+ + {/* Subtle glow effect at the bottom */} +
+
+ ); +} diff --git a/FRRONTEEEND/components/ChatInterface.tsx b/FRRONTEEEND/components/ChatInterface.tsx new file mode 100644 index 0000000000000000000000000000000000000000..f4a750af21511d594edc28a3d68e666d8d40a7d2 --- /dev/null +++ b/FRRONTEEEND/components/ChatInterface.tsx @@ -0,0 +1,571 @@ + +import React, { useState, useRef, useEffect } from 'react'; +import { motion, AnimatePresence } from 'framer-motion'; +import { Send, Plus, Search, Settings, MoreHorizontal, User, Bot, ArrowLeft, Paperclip, Sparkles, Trash2, X, Upload } from 'lucide-react'; +import { cn } from '../lib/utils'; +import { Logo } from './Logo'; +import ReactMarkdown from 'react-markdown'; + +interface Message { + id: string; + role: 'user' | 'assistant'; + content: string; + timestamp: Date; + file?: { + name: string; + size: number; + }; + reports?: Array<{ + name: string; + path: string; + }>; +} + +interface ChatSession { + id: string; + title: string; + messages: Message[]; + updatedAt: Date; +} + +export const ChatInterface: React.FC<{ onBack: () => void }> = ({ onBack }) => { + const [sessions, setSessions] = useState([ + { + id: '1', + title: 'ML Model Analysis', + messages: [], + updatedAt: new Date(), + } + ]); + const [activeSessionId, setActiveSessionId] = useState('1'); + const [input, setInput] = useState(''); + const [isTyping, setIsTyping] = useState(false); + const [uploadedFile, setUploadedFile] = useState(null); + const [reportModalUrl, setReportModalUrl] = useState(null); + const fileInputRef = useRef(null); + const scrollRef = useRef(null); + + const activeSession = sessions.find(s => s.id === activeSessionId) || sessions[0]; + + useEffect(() => { + if (scrollRef.current) { + scrollRef.current.scrollTop = scrollRef.current.scrollHeight; + } + }, [activeSession.messages, isTyping]); + + const handleSend = async () => { + if ((!input.trim() && !uploadedFile) || isTyping) return; + + const userMessage: Message = { + id: Date.now().toString(), + role: 'user', + content: input || (uploadedFile ? `Uploaded: ${uploadedFile.name}` : ''), + timestamp: new Date(), + file: uploadedFile ? 
{ name: uploadedFile.name, size: uploadedFile.size } : undefined, + }; + + const newMessages = [...activeSession.messages, userMessage]; + updateSession(activeSessionId, newMessages); + setInput(''); + setIsTyping(true); + + try { + // Use the current origin if running on same server, otherwise use env variable + const API_URL = window.location.origin; + console.log('API URL:', API_URL); + + let response; + + if (uploadedFile) { + const formData = new FormData(); + formData.append('file', uploadedFile); + formData.append('task_description', input || 'Analyze this dataset and provide insights'); + formData.append('use_cache', 'true'); + formData.append('max_iterations', '20'); + + response = await fetch(`${API_URL}/run`, { + method: 'POST', + body: formData + }); + + setUploadedFile(null); + } else { + response = await fetch(`${API_URL}/chat`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + messages: newMessages.map(m => ({ + role: m.role, + content: m.content + })), + stream: false + }) + }); + } + + if (!response.ok) { + throw new Error(`API error: ${response.status}`); + } + + const data = await response.json(); + + let assistantContent = ''; + let reports: Array<{name: string, path: string}> = []; + + if (uploadedFile && data.result) { + const result = data.result; + assistantContent = `✅ Analysis Complete!\n\n`; + + // Extract report paths from workflow history + if (result.workflow_history) { + const reportTools = ['generate_ydata_profiling_report', 'generate_sweetviz_report', 'generate_combined_eda_report']; + result.workflow_history.forEach((step: any) => { + if (reportTools.includes(step.tool)) { + // Check multiple possible locations for the report path + const reportPath = step.result?.output_path || step.result?.report_path || step.arguments?.output_path; + + if (reportPath && (step.result?.success !== false)) { + reports.push({ + name: step.tool.replace('generate_', '').replace(/_/g, ' ').replace('report', '').trim(), + path: reportPath + }); + } + } + }); + } + + // Also check for report paths mentioned in the summary text + if (result.summary && !reports.length) { + const reportPathMatch = result.summary.match(/\.(\/outputs\/reports\/[^\s]+\.html)/); + if (reportPathMatch) { + reports.push({ + name: 'ydata profiling', + path: reportPathMatch[1] + }); + } + } + + if (result.summary) { + assistantContent += `**Summary:**\n${result.summary}\n\n`; + } + + if (result.workflow_history && result.workflow_history.length > 0) { + assistantContent += `**Tools Used:** ${result.workflow_history.length} steps\n\n`; + assistantContent += `**Final Result:**\n${result.final_result || 'Analysis completed successfully'}`; + } + } else if (data.success && data.message) { + assistantContent = data.message; + } else { + throw new Error('Invalid response from API'); + } + + updateSession(activeSessionId, [...newMessages, { + id: (Date.now() + 1).toString(), + role: 'assistant', + content: assistantContent, + timestamp: new Date(), + reports: reports.length > 0 ? 
reports : undefined + }]); + } catch (error: any) { + console.error("Chat Error:", error); + + let errorMessage = "I'm sorry, I encountered an error processing your request."; + + if (error.message) { + errorMessage += `\n\n**Error:** ${error.message}`; + } + + // Try to parse response error + try { + const errorText = await error.text?.(); + if (errorText) { + const errorData = JSON.parse(errorText); + if (errorData.detail) { + errorMessage = `**Error:** ${typeof errorData.detail === 'string' ? errorData.detail : JSON.stringify(errorData.detail)}`; + } + } + } catch (e) { + // Ignore parsing errors + } + + updateSession(activeSessionId, [...newMessages, { + id: 'err-' + Date.now(), + role: 'assistant', + content: errorMessage, + timestamp: new Date() + }]); + } finally { + setIsTyping(false); + } + }; + + const updateSession = (id: string, messages: Message[]) => { + setSessions(prev => prev.map(s => { + if (s.id === id) { + return { ...s, messages, updatedAt: new Date() }; + } + return s; + })); + }; + + const createNewChat = () => { + const newId = Date.now().toString(); + const newSession: ChatSession = { + id: newId, + title: 'New Chat', + messages: [], + updatedAt: new Date() + }; + setSessions([newSession, ...sessions]); + setActiveSessionId(newId); + }; + + const deleteSession = (e: React.MouseEvent, id: string) => { + e.stopPropagation(); + if (sessions.length === 1) return; + setSessions(prev => prev.filter(s => s.id !== id)); + if (activeSessionId === id) { + setActiveSessionId(sessions.find(s => s.id !== id)?.id || ''); + } + }; + + const handleFileSelect = (e: React.ChangeEvent) => { + const file = e.target.files?.[0]; + if (file) { + const validTypes = ['.csv', '.parquet']; + const fileExt = file.name.substring(file.name.lastIndexOf('.')).toLowerCase(); + + if (validTypes.includes(fileExt)) { + setUploadedFile(file); + } else { + alert('Please upload a CSV or Parquet file'); + } + } + }; + + const removeFile = () => { + setUploadedFile(null); + if (fileInputRef.current) { + fileInputRef.current.value = ''; + } + }; + + return ( +
+ {/* Sidebar */} + + + {/* Main Chat Area */} +
+ {/* Top Header */} +
+
+ +
+

{activeSession.title}

+

{activeSession.messages.length} messages in session

+
+
+
+ + +
+
+ + {/* Message List */} +
+ {activeSession.messages.length === 0 ? ( +
+ + + +

Welcome, Data Scientist

+

+ I'm your autonomous agent ready to profile data, train models, or build dashboards. + Try uploading a dataset or describing your ML objective. +

+
+ {[ + "Profile my sales.csv", + "Train a XGBoost classifier", + "Generate a correlation heatmap", + "Explain feature importance" + ].map(prompt => ( + + ))} +
+
+ ) : ( + activeSession.messages.map((msg) => ( + +
+ {msg.role === 'user' ? : } +
+
+ {msg.file && ( +
+ + {msg.file.name} + ({(msg.file.size / 1024).toFixed(1)} KB) +
+ )} + {msg.role === 'assistant' ? ( +

, + ul: ({node, ...props}) =>

    , + ol: ({node, ...props}) =>
      , + li: ({node, ...props}) =>
    1. , + strong: ({node, ...props}) => , + code: ({node, inline, ...props}: any) => + inline ? + : + + }} + > + {msg.content || ''} + + ) : ( + msg.content || (msg.role === 'assistant' && isTyping && "...") + )} + {msg.reports && msg.reports.length > 0 && ( +
      + {msg.reports.map((report, idx) => ( + + ))} +
      + )} +
      + {msg.timestamp.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })} +
      +
+
+ )) + )} + {isTyping && activeSession.messages[activeSession.messages.length - 1]?.role === 'user' && ( +
+
+ +
+
+
+ + + +
+
+
+ )} +
+ + {/* Input Bar */} +
+
+
+ + + {uploadedFile && ( +
+ + {uploadedFile.name} + +
+ )} +
+
+