Pulastya B committed
Commit 226ac39 · 0 Parent(s):

feat: Initial commit - Data Science Agent with React frontend and FastAPI backend

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .dockerignore +74 -0
  2. .env.example +19 -0
  3. .gcloudignore +59 -0
  4. .gitignore +71 -0
  5. BIGQUERY_SCHEMAS.md +691 -0
  6. CHECKLIST.md +97 -0
  7. DEPLOYMENT.md +495 -0
  8. Dockerfile +78 -0
  9. FRONTEND_INTEGRATION.md +234 -0
  10. FRRONTEEEND/.env.production +3 -0
  11. FRRONTEEEND/.gitignore +24 -0
  12. FRRONTEEEND/App.tsx +59 -0
  13. FRRONTEEEND/README.md +20 -0
  14. FRRONTEEEND/components/BackgroundPaths.tsx +148 -0
  15. FRRONTEEEND/components/ChatInterface.tsx +571 -0
  16. FRRONTEEEND/components/Footer.tsx +171 -0
  17. FRRONTEEEND/components/HeroGeometric.tsx +213 -0
  18. FRRONTEEEND/components/KeyCapabilities.tsx +91 -0
  19. FRRONTEEEND/components/Logo.tsx +92 -0
  20. FRRONTEEEND/components/ProblemSolution.tsx +70 -0
  21. FRRONTEEEND/components/Process.tsx +70 -0
  22. FRRONTEEEND/components/ShadowSection.tsx +222 -0
  23. FRRONTEEEND/components/TechStack.tsx +36 -0
  24. FRRONTEEEND/index.html +59 -0
  25. FRRONTEEEND/index.tsx +16 -0
  26. FRRONTEEEND/lib/utils.ts +7 -0
  27. FRRONTEEEND/metadata.json +5 -0
  28. FRRONTEEEND/package-lock.json +0 -0
  29. FRRONTEEEND/package.json +26 -0
  30. FRRONTEEEND/tsconfig.json +29 -0
  31. FRRONTEEEND/vite.config.ts +29 -0
  32. GEMINI_UPDATE.md +93 -0
  33. MIGRATION_COMPLETE.md +325 -0
  34. QUICK_REFERENCE.txt +71 -0
  35. README.md +632 -0
  36. build-and-deploy.ps1 +39 -0
  37. build-and-deploy.sh +33 -0
  38. cache_db/.gitkeep +0 -0
  39. chat_ui.py +1073 -0
  40. cloudbuild.yaml +69 -0
  41. data/.gitkeep +0 -0
  42. deploy.sh +171 -0
  43. examples/titanic_example.py +166 -0
  44. requirements.txt +98 -0
  45. setup-deployment.sh +78 -0
  46. src/__init__.py +7 -0
  47. src/api/__init__.py +4 -0
  48. src/api/app.py +513 -0
  49. src/cache/__init__.py +5 -0
  50. src/cache/cache_manager.py +292 -0
.dockerignore ADDED
@@ -0,0 +1,74 @@
+ # Python cache and environment
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # Development files
+ .git/
+ .gitignore
+ .env
+ .env.local
+ *.log
+
+ # Output directories (not needed in container)
+ outputs/
+ cache_db/
+ temp/
+ test_data/
+ data/
+
+ # Frontend development files (will be built in Docker)
+ FRRONTEEEND/node_modules/
+ FRRONTEEEND/.env
+ FRRONTEEEND/.env.local
+
+ # Documentation and tests
+ *.md
+ !README.md
+ tests/
+ test_*.py
+ check_*.py
+
+ # Old Gradio UI (no longer used)
+ chat_ui.py
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter notebooks
+ *.ipynb
+ .ipynb_checkpoints/
+
+ # Large model files (if any)
+ *.pkl
+ *.joblib
+ *.h5
+ *.pt
+ *.pth
+
+ # Documentation
+ docs/
+ PHASE*.md
+ PROJECT*.md
+ TOKEN*.md
+ TOOL*.md
+ FEATURE*.md
+ IMPLEMENTATION*.md
+ MIGRATION*.md
+ EDA_REPORTS*.md
+ GITHUB*.md
+ BIGQUERY*.md
.env.example ADDED
@@ -0,0 +1,19 @@
+ # Google Gemini API Configuration
+ GOOGLE_API_KEY=your_google_api_key_here
+
+ # Model Configuration
+ LLM_PROVIDER=gemini
+ REASONING_EFFORT=medium
+
+ # Cache Configuration
+ CACHE_DB_PATH=./cache_db/cache.db
+ CACHE_TTL_SECONDS=86400
+
+ # Output Configuration
+ OUTPUT_DIR=./outputs
+ DATA_DIR=./data
+
+ # Performance Configuration
+ MAX_PARALLEL_TOOLS=5
+ MAX_RETRIES=3
+ TIMEOUT_SECONDS=300
.gcloudignore ADDED
@@ -0,0 +1,59 @@
+ # This file specifies files that are *not* uploaded to Google Cloud
+ # using gcloud. It follows the same syntax as .gitignore
+
+ .gcloudignore
+ .git
+ .gitignore
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # Local development
+ .env
+ .env.local
+ *.log
+
+ # Outputs and cache (regenerated in cloud)
+ outputs/
+ cache_db/
+ temp/
+ test_data/
+ data/
+
+ # Documentation
+ *.md
+ !README.md
+
+ # Tests
+ tests/
+ test_*.py
+ check_*.py
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter
+ *.ipynb
+ .ipynb_checkpoints/
+
+ # Build artifacts
+ *.pkl
+ *.joblib
+ *.h5
+ *.pt
+ *.pth
.gitignore ADDED
@@ -0,0 +1,71 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ .venv/
+ env/
+ ENV/
+
+ # Environment Variables
+ .env
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Cache & Outputs
+ cache_db/*.db
+ cache_db/*.db-journal
+ cache_db/
+ outputs/
+ temp/
+ *.pkl
+ *.joblib
+
+ # Data files (except examples)
+ data/*.csv
+ data/*.parquet
+ !data/.gitkeep
+
+ # Cloud Run URL
+ .cloud_run_url
+
+ # Jupyter
+ .ipynb_checkpoints/
+ *.ipynb
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Testing
+ .pytest_cache/
+ .coverage
+ htmlcov/
+ .tox/
+
+ # Logs
+ *.log
BIGQUERY_SCHEMAS.md ADDED
@@ -0,0 +1,691 @@
+ # BigQuery Output Schemas for Looker Compatibility
+
+ **Purpose**: Define stable BigQuery table schemas that BI tools (Looker, Data Studio) can query reliably.
+
+ **Design Principles**:
+ - ✅ **Stable Schema**: No breaking changes without versioning
+ - ✅ **Consistent Naming**: snake_case columns, clear dimension/metric separation
+ - ✅ **BI-Friendly Types**: Standard SQL types, no complex nested structures
+ - ✅ **Documented Grain**: Clear primary keys and update patterns
+ - ✅ **Dashboard-Ready**: Metrics aligned with common visualizations
+
+ ---
+
+ ## 📊 Table 1: `model_metrics`
+
+ **Description**: Model performance metrics tracked over time for monitoring and comparison.
+
+ **Use Cases**:
+ - Performance dashboards
+ - Model comparison reports
+ - Drift detection alerts
+ - A/B test analysis
+
+ **Update Frequency**: On every model training run
+
+ **Grain**: One row per model training execution
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `project_id` | STRING | Google Cloud project ID | Dimension | `my-ml-project` |
+ | `dataset_id` | STRING | BigQuery dataset name | Dimension | `ml_models` |
+ | `model_id` | STRING | Unique model identifier | Dimension (Primary Key) | `xgboost_churn_20251223_153045` |
+ | `model_name` | STRING | Human-readable model name | Dimension | `Customer Churn Predictor` |
+ | `model_type` | STRING | Algorithm used | Dimension | `XGBoost`, `RandomForest`, `LightGBM` |
+ | `task_type` | STRING | ML task category | Dimension | `classification`, `regression` |
+ | `training_dataset` | STRING | Source table/file reference | Dimension | `project.dataset.train_data` |
+ | `target_column` | STRING | Prediction target name | Dimension | `churn`, `price`, `survived` |
+ | `created_at` | TIMESTAMP | Model training timestamp | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `created_date` | DATE | Training date (for partitioning) | Dimension (Time) | `2025-12-23` |
+ | `feature_count` | INTEGER | Number of features used | Metric | `42` |
+ | `training_rows` | INTEGER | Training set size | Metric | `10000` |
+ | `test_rows` | INTEGER | Test set size | Metric | `2500` |
+ | `training_duration_seconds` | FLOAT | Time to train model | Metric | `123.45` |
+ | `accuracy` | FLOAT | Overall accuracy (0-1) | Metric | `0.95` |
+ | `precision` | FLOAT | Precision score (0-1) | Metric | `0.92` |
+ | `recall` | FLOAT | Recall score (0-1) | Metric | `0.88` |
+ | `f1_score` | FLOAT | F1 score (0-1) | Metric | `0.90` |
+ | `roc_auc` | FLOAT | ROC AUC score (0-1) | Metric | `0.94` |
+ | `pr_auc` | FLOAT | Precision-Recall AUC (0-1) | Metric | `0.91` |
+ | `mae` | FLOAT | Mean Absolute Error (regression) | Metric | `1234.56` |
+ | `mse` | FLOAT | Mean Squared Error (regression) | Metric | `567890.12` |
+ | `rmse` | FLOAT | Root Mean Squared Error (regression) | Metric | `753.59` |
+ | `r2_score` | FLOAT | R² coefficient (regression) | Metric | `0.85` |
+ | `cross_val_mean` | FLOAT | Mean CV score | Metric | `0.93` |
+ | `cross_val_std` | FLOAT | CV score std deviation | Metric | `0.02` |
+ | `hyperparameters` | STRING (JSON) | Model hyperparameters | Metadata | `{"max_depth": 6, "n_estimators": 100}` |
+ | `version` | STRING | Model version tag | Dimension | `v1.2.3` |
+ | `environment` | STRING | Training environment | Dimension | `production`, `staging`, `development` |
+ | `user_email` | STRING | User who trained model | Dimension | `data-scientist@company.com` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ -- Recommended table setup
+ CREATE TABLE `project.dataset.model_metrics`
+ (
+   -- columns as above
+ )
+ PARTITION BY created_date
+ CLUSTER BY model_type, task_type, environment
+ OPTIONS(
+   description="Model performance metrics for BI dashboards",
+   require_partition_filter=true
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Time**: `created_at`, `created_date`
+ - **Model**: `model_type`, `model_name`, `task_type`
+ - **Performance Tier**: CASE expression on `accuracy`/`f1_score`
+   - `Excellent` (>0.90)
+   - `Good` (0.80-0.90)
+   - `Fair` (0.70-0.80)
+   - `Poor` (<0.70)
+
+ ### Sample Looker View
+
+ ```lookml
+ view: model_metrics {
+   sql_table_name: `project.dataset.model_metrics` ;;
+
+   dimension: model_id {
+     primary_key: yes
+     type: string
+     sql: ${TABLE}.model_id ;;
+   }
+
+   dimension_group: created {
+     type: time
+     timeframes: [date, week, month, quarter, year]
+     sql: ${TABLE}.created_at ;;
+   }
+
+   dimension: model_type {
+     type: string
+     sql: ${TABLE}.model_type ;;
+   }
+
+   dimension: performance_tier {
+     type: string
+     sql: CASE
+            WHEN ${TABLE}.accuracy >= 0.90 THEN 'Excellent'
+            WHEN ${TABLE}.accuracy >= 0.80 THEN 'Good'
+            WHEN ${TABLE}.accuracy >= 0.70 THEN 'Fair'
+            ELSE 'Poor'
+          END ;;
+   }
+
+   measure: count {
+     type: count
+   }
+
+   measure: avg_accuracy {
+     type: average
+     sql: ${TABLE}.accuracy ;;
+     value_format_name: percent_2
+   }
+
+   measure: avg_f1_score {
+     type: average
+     sql: ${TABLE}.f1_score ;;
+     value_format_name: percent_2
+   }
+ }
+ ```
+
+ ---
+
+ ## 🎯 Table 2: `feature_importance`
+
+ **Description**: Feature importance scores for model interpretability.
+
+ **Use Cases**:
+ - Feature impact analysis
+ - Feature selection dashboards
+ - Model explainability reports
+
+ **Update Frequency**: On every model training run
+
+ **Grain**: One row per feature per model
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `model_id` | STRING | Foreign key to model_metrics | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` |
+ | `feature_name` | STRING | Name of the feature | Dimension (Primary Key) | `age`, `total_purchases`, `days_since_last_login` |
+ | `importance_score` | FLOAT | Importance value (0-1) | Metric | `0.35` |
+ | `importance_rank` | INTEGER | Rank by importance (1=most important) | Metric | `1`, `2`, `3` |
+ | `importance_type` | STRING | Calculation method | Dimension | `gain`, `weight`, `cover`, `shap` |
+ | `feature_type` | STRING | Data type category | Dimension | `numeric`, `categorical`, `datetime`, `text` |
+ | `is_engineered` | BOOLEAN | Created by feature engineering? | Dimension | `true`, `false` |
+ | `created_at` | TIMESTAMP | When importance was calculated | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `created_date` | DATE | Calculation date | Dimension (Time) | `2025-12-23` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ CREATE TABLE `project.dataset.feature_importance`
+ (
+   -- columns as above
+ )
+ PARTITION BY created_date
+ CLUSTER BY model_id, importance_rank
+ OPTIONS(
+   description="Feature importance scores for model explainability",
+   require_partition_filter=false  -- Allow cross-model queries
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Feature**: `feature_name`, `feature_type`, `is_engineered`
+ - **Model**: `model_id` (join to model_metrics)
+ - **Importance**: `importance_rank`, `importance_type`
+
+ ### Sample Looker View
+
+ ```lookml
+ view: feature_importance {
+   sql_table_name: `project.dataset.feature_importance` ;;
+
+   dimension: compound_key {
+     primary_key: yes
+     hidden: yes
+     sql: CONCAT(${TABLE}.model_id, '|', ${TABLE}.feature_name) ;;
+   }
+
+   dimension: feature_name {
+     type: string
+     sql: ${TABLE}.feature_name ;;
+   }
+
+   dimension: is_top_10 {
+     type: yesno
+     sql: ${TABLE}.importance_rank <= 10 ;;
+   }
+
+   measure: avg_importance {
+     type: average
+     sql: ${TABLE}.importance_score ;;
+     value_format_name: percent_2
+   }
+
+   measure: count_features {
+     type: count_distinct
+     sql: ${TABLE}.feature_name ;;
+   }
+ }
+ ```
+
+ ---
+
+ ## 🔮 Table 3: `predictions`
+
+ **Description**: Model predictions with actuals for monitoring and evaluation.
+
+ **Use Cases**:
+ - Prediction monitoring
+ - Accuracy tracking over time
+ - Segment performance analysis
+ - Business impact measurement
+
+ **Update Frequency**: Real-time or batch (daily/hourly)
+
+ **Grain**: One row per prediction
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `prediction_id` | STRING | Unique prediction identifier | Dimension (Primary Key) | `pred_abc123xyz` |
+ | `model_id` | STRING | Model used for prediction | Dimension (Foreign Key) | `xgboost_churn_20251223_153045` |
+ | `entity_id` | STRING | Entity being predicted (customer_id, product_id, etc.) | Dimension | `customer_12345` |
+ | `predicted_at` | TIMESTAMP | When prediction was made | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `predicted_date` | DATE | Prediction date (for partitioning) | Dimension (Time) | `2025-12-23` |
+ | `prediction_value` | FLOAT | Predicted value | Metric | `0.85` (probability), `49.99` (price) |
+ | `prediction_class` | STRING | Predicted class (classification) | Dimension | `churn`, `not_churn` |
+ | `prediction_confidence` | FLOAT | Model confidence (0-1) | Metric | `0.92` |
+ | `actual_value` | FLOAT | True value (when available) | Metric | `1.0` (churned), `52.50` (actual price) |
+ | `actual_class` | STRING | True class (when available) | Dimension | `churn`, `not_churn` |
+ | `actual_recorded_at` | TIMESTAMP | When actual became known | Dimension (Time) | `2025-12-30 10:00:00 UTC` |
+ | `is_correct` | BOOLEAN | Prediction was correct? | Dimension | `true`, `false` |
+ | `absolute_error` | FLOAT | \|predicted - actual\| | Metric | `2.51` |
+ | `squared_error` | FLOAT | (predicted - actual)² | Metric | `6.30` |
+ | `feature_values` | STRING (JSON) | Input features used | Metadata | `{"age": 35, "tenure": 24}` |
+ | `segment` | STRING | Business segment | Dimension | `enterprise`, `smb`, `consumer` |
+ | `region` | STRING | Geographic region | Dimension | `us-west`, `eu-central` |
+ | `model_version` | STRING | Model version | Dimension | `v1.2.3` |
+ | `prediction_latency_ms` | FLOAT | Inference time | Metric | `23.4` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ CREATE TABLE `project.dataset.predictions`
+ (
+   -- columns as above
+ )
+ PARTITION BY predicted_date
+ CLUSTER BY model_id, segment, is_correct
+ OPTIONS(
+   description="Model predictions with actuals for monitoring",
+   require_partition_filter=true,
+   partition_expiration_days=730  -- 2 years retention
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Time**: `predicted_date`, days since prediction
+ - **Model**: `model_id`, `model_version`
+ - **Segment**: `segment`, `region`
+ - **Accuracy**: `is_correct`, error buckets
+
+ ### Sample Looker View
+
+ ```lookml
+ view: predictions {
+   sql_table_name: `project.dataset.predictions` ;;
+
+   dimension: prediction_id {
+     primary_key: yes
+     type: string
+     sql: ${TABLE}.prediction_id ;;
+   }
+
+   dimension_group: predicted {
+     type: time
+     timeframes: [date, week, month]
+     sql: ${TABLE}.predicted_at ;;
+   }
+
+   dimension: segment {
+     type: string
+     sql: ${TABLE}.segment ;;
+   }
+
+   dimension: error_bucket {
+     type: string
+     sql: CASE
+            WHEN ${TABLE}.absolute_error IS NULL THEN 'No Actual Yet'
+            WHEN ${TABLE}.absolute_error <= 0.1 THEN '0-10%'
+            WHEN ${TABLE}.absolute_error <= 0.2 THEN '10-20%'
+            ELSE '>20%'
+          END ;;
+   }
+
+   measure: count {
+     type: count
+   }
+
+   measure: accuracy_rate {
+     type: average
+     sql: CAST(${TABLE}.is_correct AS FLOAT64) ;;
+     value_format_name: percent_1
+   }
+
+   measure: avg_confidence {
+     type: average
+     sql: ${TABLE}.prediction_confidence ;;
+     value_format_name: percent_2
+   }
+
+   measure: mae {
+     type: average
+     sql: ${TABLE}.absolute_error ;;
+     value_format_name: decimal_2
+   }
+ }
+ ```
+
+ ---
+
+ ## 📋 Table 4: `data_profile_summary`
+
+ **Description**: Dataset profiling statistics for data quality monitoring.
+
+ **Use Cases**:
+ - Data quality dashboards
+ - Schema drift detection
+ - Data validation reports
+ - Column-level monitoring
+
+ **Update Frequency**: Daily or on-demand
+
+ **Grain**: One row per column per dataset per run
+
+ ### Schema
+
+ | Column Name | Type | Description | Dimension/Metric | Example |
+ |------------|------|-------------|------------------|---------|
+ | `profile_id` | STRING | Unique profile run identifier | Dimension (Primary Key) | `profile_abc123xyz` |
+ | `dataset_name` | STRING | Source table/file name | Dimension | `project.dataset.customers` |
+ | `column_name` | STRING | Column being profiled | Dimension | `age`, `email`, `signup_date` |
+ | `profiled_at` | TIMESTAMP | When profiling ran | Dimension (Time) | `2025-12-23 15:30:45 UTC` |
+ | `profiled_date` | DATE | Profiling date | Dimension (Time) | `2025-12-23` |
+ | `data_type` | STRING | Column data type | Dimension | `INTEGER`, `STRING`, `FLOAT`, `TIMESTAMP` |
+ | `inferred_type` | STRING | Smart type inference | Dimension | `numeric`, `categorical`, `datetime`, `text`, `email` |
+ | `row_count` | INTEGER | Total rows in dataset | Metric | `10000` |
+ | `non_null_count` | INTEGER | Non-null values | Metric | `9850` |
+ | `null_count` | INTEGER | Null values | Metric | `150` |
+ | `null_percentage` | FLOAT | % null (0-100) | Metric | `1.5` |
+ | `unique_count` | INTEGER | Distinct values | Metric | `450` |
+ | `uniqueness_percentage` | FLOAT | % unique (0-100) | Metric | `4.5` |
+ | `min_value` | STRING | Minimum value (as string) | Metadata | `18`, `2020-01-01` |
+ | `max_value` | STRING | Maximum value (as string) | Metadata | `95`, `2025-12-23` |
+ | `mean_value` | FLOAT | Mean (numeric only) | Metric | `42.5` |
+ | `median_value` | FLOAT | Median (numeric only) | Metric | `38.0` |
+ | `std_dev` | FLOAT | Standard deviation (numeric only) | Metric | `15.2` |
+ | `skewness` | FLOAT | Distribution skewness | Metric | `0.85` |
+ | `kurtosis` | FLOAT | Distribution kurtosis | Metric | `2.1` |
+ | `top_value` | STRING | Most common value | Metadata | `male`, `active` |
+ | `top_value_frequency` | INTEGER | Count of most common value | Metric | `6500` |
+ | `top_value_percentage` | FLOAT | % of most common value | Metric | `65.0` |
+ | `has_outliers` | BOOLEAN | Outliers detected? | Dimension | `true`, `false` |
+ | `outlier_count` | INTEGER | Number of outliers | Metric | `23` |
+ | `outlier_percentage` | FLOAT | % outliers | Metric | `0.23` |
+ | `quality_score` | FLOAT | Overall quality score (0-100) | Metric | `92.5` |
+ | `quality_issues` | STRING (JSON) | Detected issues | Metadata | `["high_nulls", "duplicate_values"]` |
+ | `validation_status` | STRING | Quality check result | Dimension | `pass`, `warn`, `fail` |
+
+ ### Partitioning & Clustering
+
+ ```sql
+ CREATE TABLE `project.dataset.data_profile_summary`
+ (
+   -- columns as above
+ )
+ PARTITION BY profiled_date
+ CLUSTER BY dataset_name, validation_status
+ OPTIONS(
+   description="Dataset profiling for data quality monitoring",
+   require_partition_filter=true,
+   partition_expiration_days=90  -- 3 months retention
+ );
+ ```
+
+ ### Primary Dimensions for Looker
+
+ - **Dataset**: `dataset_name`
+ - **Column**: `column_name`, `data_type`, `inferred_type`
+ - **Quality**: `validation_status`, `quality_score` buckets
+ - **Time**: `profiled_date`
+
+ ### Sample Looker View
+
+ ```lookml
+ view: data_profile_summary {
+   sql_table_name: `project.dataset.data_profile_summary` ;;
+
+   dimension: compound_key {
+     primary_key: yes
+     hidden: yes
+     sql: CONCAT(${TABLE}.profile_id, '|', ${TABLE}.column_name) ;;
+   }
+
+   dimension: column_name {
+     type: string
+     sql: ${TABLE}.column_name ;;
+   }
+
+   dimension: quality_tier {
+     type: string
+     sql: CASE
+            WHEN ${TABLE}.quality_score >= 90 THEN 'Excellent'
+            WHEN ${TABLE}.quality_score >= 75 THEN 'Good'
+            WHEN ${TABLE}.quality_score >= 60 THEN 'Fair'
+            ELSE 'Poor'
+          END ;;
+   }
+
+   dimension: has_quality_issues {
+     type: yesno
+     sql: ${TABLE}.validation_status IN ('warn', 'fail') ;;
+   }
+
+   measure: count_columns {
+     type: count_distinct
+     sql: ${TABLE}.column_name ;;
+   }
+
+   measure: avg_quality_score {
+     type: average
+     sql: ${TABLE}.quality_score ;;
+     value_format_name: decimal_1
+   }
+
+   measure: avg_null_percentage {
+     type: average
+     sql: ${TABLE}.null_percentage ;;
+     value_format_name: percent_1
+   }
+
+   measure: columns_with_issues {
+     type: count_distinct
+     sql: ${TABLE}.column_name ;;
+     filters: [has_quality_issues: "yes"]
+   }
+ }
+ ```
+
+ ---
+
+ ## 🔄 Schema Evolution Guidelines
+
+ ### ✅ **SAFE Changes** (Non-Breaking)
+
+ 1. **Add new columns** (always nullable or with defaults)
+    ```sql
+    ALTER TABLE `project.dataset.model_metrics`
+    ADD COLUMN IF NOT EXISTS new_metric FLOAT64;
+    ```
+
+ 2. **Add new tables** (doesn't affect existing dashboards)
+
+ 3. **Lengthen STRING columns** (VARCHAR(50) → VARCHAR(100))
+
+ 4. **Add indexes/clustering** (performance only)
+
+ 5. **Add column descriptions**
+    ```sql
+    ALTER TABLE `project.dataset.model_metrics`
+    ALTER COLUMN accuracy SET OPTIONS (description='Model accuracy (0-1)');
+    ```
+
+ ### ❌ **BREAKING Changes** (Require Dashboard Updates)
+
+ 1. **Rename columns** → Use views for backward compatibility:
+    ```sql
+    CREATE OR REPLACE VIEW `project.dataset.model_metrics_v2` AS
+    SELECT
+      model_id,
+      accuracy AS acc,  -- renamed column
+      ...
+    FROM `project.dataset.model_metrics`;
+    ```
+
+ 2. **Change data types** → Create new column, migrate, deprecate old:
+    ```sql
+    -- Step 1: Add new column
+    ALTER TABLE model_metrics ADD COLUMN created_at_new TIMESTAMP;
+
+    -- Step 2: Backfill
+    UPDATE model_metrics SET created_at_new = CAST(created_at AS TIMESTAMP) WHERE true;
+
+    -- Step 3: Update dashboards to use new column
+
+    -- Step 4: Drop old column after validation period
+    ALTER TABLE model_metrics DROP COLUMN created_at;
+    ```
+
+ 3. **Remove columns** → Deprecate first, remove after 90 days
+
+ 4. **Change partitioning** → Requires table recreation
+
+ ### 🔄 **Versioning Strategy**
+
+ For major schema changes, create versioned tables:
+
+ ```
+ project.dataset.model_metrics_v1 (deprecated, keep 90 days)
+ project.dataset.model_metrics_v2 (current)
+ project.dataset.model_metrics (view pointing to latest version)
+ ```
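+
+ The stable view name is what keeps dashboards working across versions. As a minimal sketch (an assumed maintenance step, not a helper shipped in this repo), repointing the view with the Python BigQuery client looks like:
+
+ ```python
+ from google.cloud import bigquery
+
+ client = bigquery.Client()
+ # Repoint the stable name at the latest versioned table so Looker
+ # explores keep working while _v1 is deprecated.
+ client.query(
+     """
+     CREATE OR REPLACE VIEW `project.dataset.model_metrics` AS
+     SELECT * FROM `project.dataset.model_metrics_v2`
+     """
+ ).result()
+ ```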
+
+ ---
+
+ ## 📊 Dashboard-Ready Metrics Catalog
+
+ ### Model Performance Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Model Count** | `COUNT(DISTINCT model_id)` | Total models trained |
+ | **Avg Accuracy** | `AVG(accuracy)` | Overall model quality |
+ | **Accuracy Trend** | `AVG(accuracy) OVER (ORDER BY created_date)` | Performance over time |
+ | **Best Model** | `model_id WHERE accuracy = MAX(accuracy)` | Top performer |
+ | **Models by Type** | `COUNT(*) GROUP BY model_type` | Algorithm distribution |
+ | **Training Time** | `AVG(training_duration_seconds)` | Resource usage |
+ | **Recent Models** | `WHERE created_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)` | Latest activity |
+
+ ### Feature Importance Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Top Features** | `WHERE importance_rank <= 10` | Most impactful features |
+ | **Avg Importance** | `AVG(importance_score)` | Feature impact distribution |
+ | **Engineered Features** | `COUNT(*) WHERE is_engineered = true` | Feature engineering effectiveness |
+ | **Feature Stability** | `STDDEV(importance_score) GROUP BY feature_name` | Consistent predictors |
+
+ ### Prediction Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Accuracy Rate** | `AVG(CAST(is_correct AS FLOAT64))` | Real-world performance |
+ | **MAE** | `AVG(absolute_error)` | Average error magnitude |
+ | **RMSE** | `SQRT(AVG(squared_error))` | Error with outlier penalty |
+ | **Predictions/Day** | `COUNT(*) GROUP BY predicted_date` | Volume tracking |
+ | **Confidence Distribution** | `APPROX_QUANTILES(prediction_confidence, 10)` | Model calibration |
+ | **Segment Performance** | `AVG(is_correct) GROUP BY segment` | Fairness check |
+
+ ### Data Quality Metrics
+
+ | Metric Name | Calculation | Use Case |
+ |------------|-------------|----------|
+ | **Data Quality Score** | `AVG(quality_score)` | Overall health |
+ | **Null Rate** | `AVG(null_percentage)` | Completeness |
+ | **Columns with Issues** | `COUNT(DISTINCT column_name) WHERE validation_status != 'pass'` | Problem areas |
+ | **Quality Trend** | `AVG(quality_score) OVER (ORDER BY profiled_date)` | Improving/degrading? |
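+
+ These calculations drop straight into scheduled queries or ad-hoc checks. A minimal sketch (table path and window are placeholders) pulling the 30-day accuracy trend with the Python BigQuery client — note the `created_date` filter, which `require_partition_filter=true` makes mandatory:
+
+ ```python
+ from google.cloud import bigquery
+
+ client = bigquery.Client()
+ rows = client.query(
+     """
+     SELECT created_date, AVG(accuracy) AS avg_accuracy
+     FROM `project.dataset.model_metrics`
+     WHERE created_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
+     GROUP BY created_date
+     ORDER BY created_date
+     """
+ ).result()
+ for row in rows:
+     print(row.created_date, row.avg_accuracy)  # one point per day for the trend chart
+ ```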
+
+ ---
+
+ ## 🎯 Sample Looker Explores
+
+ ### Explore 1: Model Performance Analysis
+
+ ```lookml
+ explore: model_metrics {
+   label: "Model Performance"
+   description: "Track model accuracy, training time, and comparison"
+
+   join: feature_importance {
+     type: left_outer
+     sql_on: ${model_metrics.model_id} = ${feature_importance.model_id} ;;
+     relationship: one_to_many
+   }
+ }
+ ```
+
+ ### Explore 2: Prediction Monitoring
+
+ ```lookml
+ explore: predictions {
+   label: "Prediction Monitoring"
+   description: "Real-time prediction accuracy and drift"
+
+   join: model_metrics {
+     type: left_outer
+     sql_on: ${predictions.model_id} = ${model_metrics.model_id} ;;
+     relationship: many_to_one
+   }
+ }
+ ```
+
+ ### Explore 3: Data Quality Dashboard
+
+ ```lookml
+ explore: data_profile_summary {
+   label: "Data Quality"
+   description: "Monitor data health and schema drift"
+ }
+ ```
+
+ ---
+
+ ## 📝 Implementation Checklist
+
+ ### Phase 1: Setup (Week 1)
+ - [ ] Create all 4 BigQuery tables with partitioning
+ - [ ] Set up service account permissions
+ - [ ] Configure table expiration policies
+ - [ ] Document table owners and update SLAs
+
+ ### Phase 2: Integration (Week 2)
+ - [ ] Update tools to write to these schemas
+ - [ ] Add schema validation in CI/CD
+ - [ ] Create data dictionary in Looker
+ - [ ] Set up table monitoring alerts
+
+ ### Phase 3: BI Layer (Week 3)
+ - [ ] Create Looker views for all 4 tables
+ - [ ] Build explores with joins
+ - [ ] Create initial dashboards
+ - [ ] Set up scheduled data refreshes
+
+ ### Phase 4: Validation (Week 4)
+ - [ ] Backfill historical data
+ - [ ] Verify dashboard accuracy
+ - [ ] Train stakeholders on dashboards
+ - [ ] Document runbooks for common issues
+
+ ---
+
+ ## 🔗 Related Tools
+
+ **BigQuery Write Tools** (src/bigquery/):
+ - `bigquery_write_results()` - Generic write function
+ - Helper: `bigquery_write_model_metrics()` - Specialized writer
+ - Helper: `bigquery_write_feature_importance()` - Specialized writer
+ - Helper: `bigquery_write_predictions()` - Specialized writer
+ - Helper: `bigquery_write_data_profile()` - Specialized writer
+
+ **Example Usage**:
+ ```python
+ from src.bigquery import bigquery_write_results
+
+ # Write model metrics
+ bigquery_write_results(
+     data=metrics_df,
+     table_id="project.dataset.model_metrics",
+     write_disposition="WRITE_APPEND"
+ )
+ ```
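+
+ The example assumes `metrics_df` already matches the `model_metrics` grain (one row per training run). A hedged sketch of assembling such a frame with pandas — values are placeholders, and only a subset of the schema's columns is shown:
+
+ ```python
+ import pandas as pd
+
+ now = pd.Timestamp.now(tz="UTC")
+ # One row per training execution; column names match Table 1 above
+ metrics_df = pd.DataFrame([{
+     "model_id": "xgboost_churn_20251223_153045",
+     "model_type": "XGBoost",
+     "task_type": "classification",
+     "created_at": now,
+     "created_date": now.date(),
+     "accuracy": 0.95,
+     "f1_score": 0.90,
+ }])
+ ```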
+
+ ---
+
+ ## 📚 Additional Resources
+
+ - [BigQuery Best Practices](https://cloud.google.com/bigquery/docs/best-practices)
+ - [Looker LookML Reference](https://cloud.google.com/looker/docs/reference/lookml-quick-reference)
+ - [Schema Design for BI](https://cloud.google.com/architecture/bigquery-data-warehouse)
+
+ ---
+
+ **Last Updated**: December 23, 2025
+ **Schema Version**: 1.0.0
+ **Maintained By**: Data Science Team
+ **Review Cadence**: Quarterly
CHECKLIST.md ADDED
@@ -0,0 +1,97 @@
+ # ✅ Pre-Launch Checklist
+
+ ## Before Running the Application
+
+ ### 1. Environment Variables ⚠️ **REQUIRED**
+
+ You MUST set your API key before starting:
+
+ ```powershell
+ # Windows PowerShell
+ $env:GOOGLE_API_KEY="your-google-api-key-here"
+
+ # Verify it's set
+ echo $env:GOOGLE_API_KEY
+ ```
+
+ ### 2. Build Status ✅
+
+ - [x] Frontend dependencies installed
+ - [x] Frontend built (FRRONTEEEND/dist exists)
+ - [x] Backend code updated with new endpoints
+ - [x] Configuration files in place
+
+ ### 3. Quick Start Commands
+
+ **Option A - Use the start script:**
+ ```powershell
+ .\start.ps1
+ ```
+
+ **Option B - Manual start:**
+ ```powershell
+ # Make sure you're in the project root
+ Set-Location "c:\Users\Pulastya\Videos\DS AGENTTTT"
+
+ # Set API key (if not already set)
+ $env:GOOGLE_API_KEY="your-key-here"
+
+ # Start the server
+ python src\api\app.py
+ ```
+
+ ### 4. Access the Application
+
+ Once the server starts, open your browser to:
+ **http://localhost:8080**
+
+ You should see:
+ 1. **Landing Page** - Professional homepage with agent features
+ 2. **Launch Console** button - Click to open the chat interface
+ 3. **Chat Interface** - Modern conversational UI
+
+ ### 5. Test the Chat
+
+ Try these sample prompts:
+ - "What can you do?"
+ - "Explain your data science capabilities"
+ - "How do I upload a dataset?"
+ - "What ML models do you support?"
+
+ ### 6. Expected Console Output
+
+ When you start the server, you should see:
+ ```
+ INFO: Started server process [####]
+ INFO: Waiting for application startup.
+ ✅ Agent initialized with provider: groq
+ ✅ Frontend assets mounted from C:\Users\Pulastya\Videos\DS AGENTTTT\FRRONTEEEND\dist
+ INFO: Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8080
+ ```
+
+ ### 7. Troubleshooting Quick Reference
+
+ | Issue | Solution |
+ |-------|----------|
+ | "Agent not initialized" | Set GOOGLE_API_KEY environment variable |
+ | "Frontend not found" | Run `cd FRRONTEEEND && npm run build` |
+ | Port 8080 in use | Kill the process or change PORT env var |
+ | Import errors | Run `pip install -r requirements.txt` |
+
+ ## Next Steps After Launch
+
+ 1. **Test the chat** with the agent
+ 2. **Upload a dataset** (feature coming soon in chat)
+ 3. **Try the API endpoints** at http://localhost:8080/docs
+ 4. **Customize the frontend** in FRRONTEEEND/components/
+
+ ## Documentation
+
+ - 📖 [MIGRATION_COMPLETE.md](MIGRATION_COMPLETE.md) - What was changed
+ - 📖 [FRONTEND_INTEGRATION.md](FRONTEND_INTEGRATION.md) - Technical details
+ - 📖 [README.md](README.md) - Main project docs
+
+ ---
+
+ **Ready to launch?** Run `.\start.ps1` and visit http://localhost:8080 🚀
DEPLOYMENT.md ADDED
@@ -0,0 +1,495 @@
+ # 🚀 Google Cloud Run Deployment Guide
+
+ A complete guide to deploying the Data Science Agent to Google Cloud Run as a serverless API.
+
+ ## 📋 Prerequisites
+
+ 1. **Google Cloud Platform Account**
+    - Active GCP account with billing enabled
+    - Project created (or use existing project)
+
+ 2. **Install Google Cloud SDK**
+    ```bash
+    # macOS (Homebrew)
+    brew install --cask google-cloud-sdk
+
+    # Or download from: https://cloud.google.com/sdk/install
+    ```
+
+ 3. **Authenticate with GCP**
+    ```bash
+    gcloud auth login
+    gcloud auth application-default login
+    ```
+
+ 4. **Set Your Project**
+    ```bash
+    gcloud config set project YOUR_PROJECT_ID
+    ```
+
+ ---
+
+ ## 🎯 Deployment Options
+
+ ### Option 1: Automated Deployment (Recommended)
+
+ Use the provided deployment script for one-command deployment:
+
+ ```bash
+ # Set required environment variables
+ export GCP_PROJECT_ID="your-project-id"
+ export GROQ_API_KEY="your-groq-api-key"
+ export GOOGLE_API_KEY="your-google-api-key"  # Optional for Gemini
+
+ # Run deployment script
+ ./deploy.sh
+ ```
+
+ **What it does:**
+ - ✅ Enables required GCP APIs (Cloud Build, Cloud Run, Secret Manager)
+ - ✅ Creates secrets for API keys
+ - ✅ Builds Docker container
+ - ✅ Deploys to Cloud Run
+ - ✅ Returns service URL
+
+ **Configuration options:**
+ ```bash
+ # Optional: Customize deployment
+ export CLOUD_RUN_REGION="us-central1"  # Change region
+ export MEMORY="4Gi"                    # Increase memory
+ export CPU="2"                         # Set CPU count
+ export MAX_INSTANCES="10"              # Scale limit
+ export TIMEOUT="900"                   # Request timeout (15 min)
+
+ ./deploy.sh
+ ```
+
+ ---
+
+ ### Option 2: Manual Deployment
+
+ Step-by-step manual deployment for full control:
+
+ #### Step 1: Enable APIs
+ ```bash
+ gcloud services enable \
+   cloudbuild.googleapis.com \
+   run.googleapis.com \
+   containerregistry.googleapis.com \
+   secretmanager.googleapis.com
+ ```
+
+ #### Step 2: Create Secrets
+ ```bash
+ # Create GROQ API key secret
+ echo -n "your-groq-api-key" | gcloud secrets create GROQ_API_KEY --data-file=-
+
+ # Create Google API key secret (optional)
+ echo -n "your-google-api-key" | gcloud secrets create GOOGLE_API_KEY --data-file=-
+
+ # Grant Cloud Run access to secrets
+ PROJECT_NUMBER=$(gcloud projects describe $(gcloud config get-value project) --format="value(projectNumber)")
+ gcloud secrets add-iam-policy-binding GROQ_API_KEY \
+   --member="serviceAccount:${PROJECT_NUMBER}-compute@developer.gserviceaccount.com" \
+   --role="roles/secretmanager.secretAccessor"
+ ```
+
+ #### Step 3: Build Container
+ ```bash
+ gcloud builds submit --tag gcr.io/$(gcloud config get-value project)/data-science-agent
+ ```
+
+ #### Step 4: Deploy to Cloud Run
+ ```bash
+ gcloud run deploy data-science-agent \
+   --image gcr.io/$(gcloud config get-value project)/data-science-agent \
+   --platform managed \
+   --region us-central1 \
+   --allow-unauthenticated \
+   --memory 4Gi \
+   --cpu 2 \
+   --timeout 900 \
+   --max-instances 10 \
+   --set-env-vars LLM_PROVIDER=groq,REASONING_EFFORT=medium \
+   --set-secrets GROQ_API_KEY=GROQ_API_KEY:latest,GOOGLE_API_KEY=GOOGLE_API_KEY:latest
+ ```
+
+ ---
+
+ ### Option 3: CI/CD with Cloud Build Triggers
+
+ Automated deployment on git push:
+
+ #### Step 1: Connect Repository
+ ```bash
+ # Connect GitHub/GitLab/Bitbucket repository
+ gcloud beta builds connections create github connection-name \
+   --region=us-central1
+ ```
+
+ #### Step 2: Create Build Trigger
+ ```bash
+ gcloud builds triggers create github \
+   --name="deploy-data-science-agent" \
+   --repo-name="Data-Science-Agent" \
+   --repo-owner="Surfing-Ninja" \
+   --branch-pattern="^main$" \
+   --build-config="cloudbuild.yaml"
+ ```
+
+ Now every push to the `main` branch automatically deploys! 🎉
+
+ ---
+
+ ## 🧪 Testing the Deployment
+
+ ### 1. Health Check
+ ```bash
+ SERVICE_URL=$(gcloud run services describe data-science-agent \
+   --region us-central1 \
+   --format 'value(status.url)')
+
+ curl $SERVICE_URL/health
+ ```
+
+ **Expected response:**
+ ```json
+ {
+   "status": "healthy",
+   "agent_ready": true,
+   "provider": "groq",
+   "tools_count": 82
+ }
+ ```
+
+ ### 2. List Available Tools
+ ```bash
+ curl $SERVICE_URL/tools | jq
+ ```
+
+ ### 3. Profile a Dataset
+ ```bash
+ curl -X POST $SERVICE_URL/profile \
+   -F "file=@test_data/sample.csv"
+ ```
+
+ ### 4. Run Full Analysis
+ ```bash
+ curl -X POST $SERVICE_URL/run \
+   -F "file=@test_data/sample.csv" \
+   -F "task_description=Analyze this dataset, detect outliers, and train a prediction model" \
+   -F "target_col=target" \
+   | jq
+ ```
+
+ ---
+
+ ## 📊 Monitoring & Logs
+
+ ### View Real-time Logs
+ ```bash
+ gcloud run logs tail data-science-agent --region us-central1
+ ```
+
+ ### View Recent Logs
+ ```bash
+ gcloud run logs read data-science-agent \
+   --region us-central1 \
+   --limit 50
+ ```
+
+ ### Cloud Console Monitoring
+ - Go to: https://console.cloud.google.com/run
+ - Click on `data-science-agent`
+ - View: Metrics, Logs, Revisions
+
+ ---
+
+ ## 💰 Cost Estimation
+
+ ### Cloud Run Pricing (as of Dec 2024)
+ **Free Tier** (per month):
+ - 2 million requests
+ - 360,000 GB-seconds of memory
+ - 180,000 vCPU-seconds
+
+ **Paid Tier** (us-central1):
+ - CPU: $0.00002400 per vCPU-second
+ - Memory: $0.00000250 per GB-second
+ - Requests: $0.40 per million requests
+
+ **Example Cost for 4Gi Memory, 2 vCPU:**
+ - 1 request taking 60 seconds
+ - CPU: 2 vCPU × 60s × $0.000024 = $0.00288
+ - Memory: 4GB × 60s × $0.0000025 = $0.0006
+ - Request: $0.0000004
+ - **Total: ~$0.0035 per request**
+
+ **Monthly estimate for 1000 requests/month:**
+ - ~$3.50/month (well within free tier for testing!)
+
+ ---
+
+ ## 🔒 Security Best Practices
+
+ ### 1. Enable Authentication (Production)
+ ```bash
+ # Deploy with authentication required
+ gcloud run deploy data-science-agent \
+   --no-allow-unauthenticated \
+   --region us-central1 \
+   --image gcr.io/PROJECT_ID/data-science-agent
+
+ # Create service account for clients
+ gcloud iam service-accounts create api-client
+
+ # Grant invoker role
+ gcloud run services add-iam-policy-binding data-science-agent \
+   --member="serviceAccount:api-client@PROJECT_ID.iam.gserviceaccount.com" \
+   --role="roles/run.invoker" \
+   --region us-central1
+ ```
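+
+ Once `--no-allow-unauthenticated` is set, callers must present an identity token. A minimal client sketch using the standard Google auth libraries (the service URL is a placeholder; run this with credentials that hold `roles/run.invoker`):
+
+ ```python
+ import requests
+ import google.auth.transport.requests
+ import google.oauth2.id_token
+
+ SERVICE_URL = "https://YOUR_SERVICE_URL"
+
+ # Mint an ID token whose audience is the Cloud Run service URL
+ auth_req = google.auth.transport.requests.Request()
+ token = google.oauth2.id_token.fetch_id_token(auth_req, SERVICE_URL)
+
+ resp = requests.get(
+     f"{SERVICE_URL}/health",
+     headers={"Authorization": f"Bearer {token}"},
+     timeout=30,
+ )
+ print(resp.json())
+ ```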
+
+ ### 2. Use VPC Connector (For BigQuery/GCS)
+ ```bash
+ # Create VPC connector
+ gcloud compute networks vpc-access connectors create ds-agent-connector \
+   --network default \
+   --region us-central1 \
+   --range 10.8.0.0/28
+
+ # Deploy with VPC
+ gcloud run deploy data-science-agent \
+   --vpc-connector ds-agent-connector \
+   --region us-central1
+ ```
+
+ ### 3. Restrict API Keys
+ - Set **Application restrictions** in Google Cloud Console
+ - Whitelist only the Cloud Run service URL
+ - Set **API restrictions** to only required APIs
+
+ ---
+
+ ## 🔧 Configuration Options
+
+ ### Environment Variables
+ ```bash
+ # Set during deployment
+ --set-env-vars KEY1=value1,KEY2=value2
+
+ # Available variables:
+ LLM_PROVIDER=groq            # or "gemini"
+ REASONING_EFFORT=medium      # low, medium, high
+ CACHE_TTL_SECONDS=86400      # Cache lifetime
+ ARTIFACT_BACKEND=local       # or "gcs" for cloud storage
+ GCS_BUCKET_NAME=your-bucket  # If using GCS backend
+ OUTPUT_DIR=/tmp/outputs      # Output directory
+ MAX_PARALLEL_TOOLS=5         # Concurrent tool execution
+ MAX_RETRIES=3                # Tool retry attempts
+ TIMEOUT_SECONDS=300          # Tool timeout
+ ```
+
+ ### Resource Limits
+ ```bash
+ --memory 4Gi        # 128Mi to 32Gi
+ --cpu 2             # 1 to 8 vCPU
+ --timeout 900       # Max 3600s (1 hour)
+ --max-instances 10  # Scale limit
+ --min-instances 0   # Always-warm instances
+ --concurrency 10    # Requests per instance
+ ```
+
+ ---
+
+ ## 🐛 Troubleshooting
+
+ ### Build Fails
+ ```bash
+ # Check build logs
+ gcloud builds list --limit=5
+ gcloud builds log BUILD_ID
+
+ # Common fixes:
+ # - Ensure Dockerfile is in root directory
+ # - Check requirements.txt has all dependencies
+ # - Increase build timeout: --timeout=1200s
+ ```
+
+ ### Deployment Fails
+ ```bash
+ # Check service status
+ gcloud run services describe data-science-agent --region us-central1
+
+ # Common fixes:
+ # - Ensure APIs are enabled
+ # - Check secrets exist and are accessible
+ # - Verify service account permissions
+ ```
+
+ ### Runtime Errors
+ ```bash
+ # View logs
+ gcloud run logs tail data-science-agent --region us-central1
+
+ # Common issues:
+ # - API keys not set: Check secrets
+ # - Import errors: Ensure all dependencies in requirements.txt
+ # - Memory issues: Increase --memory limit
+ # - Timeout: Increase --timeout value
+ ```
+
+ ### Container Crashes
+ ```bash
+ # Test locally first
+ docker build -t ds-agent .
+ docker run -p 8080:8080 \
+   -e GROQ_API_KEY="your-key" \
+   ds-agent
+
+ curl http://localhost:8080/health
+ ```
+
+ ---
+
+ ## 🚀 Advanced Features
+
+ ### Custom Domain
+ ```bash
+ # Map custom domain
+ gcloud run domain-mappings create \
+   --service data-science-agent \
+   --domain api.yourdomain.com \
+   --region us-central1
+ ```
+
+ ### Load Balancing
+ ```bash
+ # Create multiple regional deployments
+ for region in us-central1 us-east1 europe-west1; do
+   gcloud run deploy data-science-agent \
+     --image gcr.io/PROJECT_ID/data-science-agent \
+     --region $region
+ done
+
+ # Set up global load balancer
+ # Follow: https://cloud.google.com/load-balancing/docs/https/setup-global-ext-https-serverless
+ ```
+
+ ### Multi-Region Deployment
+ ```bash
+ # Deploy to multiple regions for high availability
+ CLOUD_RUN_REGION=us-central1 ./deploy.sh
+ CLOUD_RUN_REGION=europe-west1 ./deploy.sh
+ CLOUD_RUN_REGION=asia-east1 ./deploy.sh
+ ```
+
+ ---
+
+ ## 📝 API Documentation
+
+ Once deployed, access Swagger docs at:
+ ```
+ https://YOUR_SERVICE_URL/docs
+ ```
+
+ ### Available Endpoints
+
+ #### `GET /` - Health Check
+ Returns service status and tool count.
+
+ #### `GET /health` - Detailed Health
+ Returns agent readiness and provider info.
+
+ #### `GET /tools` - List Tools
+ Returns all 82 available tools organized by category.
+
+ #### `POST /run` - Run Full Analysis
+ Upload a dataset and execute the complete data science workflow.
+
+ **Parameters:**
+ - `file`: CSV/Parquet file (multipart/form-data)
+ - `task_description`: Natural language task description
+ - `target_col`: Target column for ML (optional)
+ - `use_cache`: Enable caching (default: true)
+ - `max_iterations`: Max workflow steps (default: 20)
+
+ #### `POST /profile` - Quick Profile
+ Quick dataset profiling without the full workflow.
+
+ **Parameters:**
+ - `file`: CSV/Parquet file (multipart/form-data)
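+
+ The same endpoints can be driven from Python. A minimal sketch mirroring the curl examples above (URL and file path are placeholders):
+
+ ```python
+ import requests
+
+ SERVICE_URL = "https://YOUR_SERVICE_URL"
+
+ # Multipart upload plus form fields, matching the /run parameters above
+ with open("test_data/sample.csv", "rb") as f:
+     resp = requests.post(
+         f"{SERVICE_URL}/run",
+         files={"file": f},
+         data={
+             "task_description": "Analyze this dataset and train a prediction model",
+             "target_col": "target",
+         },
+         timeout=900,  # matches the service's request timeout
+     )
+ resp.raise_for_status()
+ print(resp.json())
+ ```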
+
+ ---
+
+ ## 🔄 Updates & Rollbacks
+
+ ### Update Deployment
+ ```bash
+ # Rebuild and redeploy
+ ./deploy.sh
+ ```
+
+ ### Rollback to Previous Revision
+ ```bash
+ # List revisions
+ gcloud run revisions list --service data-science-agent --region us-central1
+
+ # Rollback
+ gcloud run services update-traffic data-science-agent \
+   --to-revisions REVISION_NAME=100 \
+   --region us-central1
+ ```
+
+ ### Blue/Green Deployment
+ ```bash
+ # Deploy new version with tag
+ gcloud run deploy data-science-agent \
+   --tag blue \
+   --no-traffic \
+   --region us-central1
+
+ # Test: https://blue---data-science-agent-HASH.run.app
+
+ # Switch traffic
+ gcloud run services update-traffic data-science-agent \
+   --to-tags blue=100 \
+   --region us-central1
+ ```
+
+ ---
+
+ ## 📚 Additional Resources
+
+ - **Cloud Run Docs**: https://cloud.google.com/run/docs
+ - **Pricing Calculator**: https://cloud.google.com/products/calculator
+ - **Best Practices**: https://cloud.google.com/run/docs/tips
+ - **Quotas & Limits**: https://cloud.google.com/run/quotas
+
+ ---
+
+ ## ✅ Deployment Checklist
+
+ - [ ] GCP project created and billing enabled
+ - [ ] Google Cloud SDK installed and authenticated
+ - [ ] API keys obtained (GROQ_API_KEY, GOOGLE_API_KEY)
+ - [ ] Secrets created in Secret Manager
+ - [ ] Docker container builds successfully locally
+ - [ ] Cloud Run APIs enabled
+ - [ ] Service deployed to Cloud Run
+ - [ ] Health check endpoint returns 200
+ - [ ] Test dataset profiled successfully
+ - [ ] Full analysis workflow tested
+ - [ ] Monitoring/logging configured
+ - [ ] Cost alerts set up (optional)
+ - [ ] Custom domain mapped (optional)
+ - [ ] CI/CD pipeline configured (optional)
+
+ ---
+
+ **Need help?** Check the troubleshooting section or view logs with:
+ ```bash
+ gcloud run logs tail data-science-agent --region us-central1
+ ```
+
+ Happy deploying! 🎉
Dockerfile ADDED
@@ -0,0 +1,78 @@
+ # Multi-stage build for Google Cloud Run
+ # Stage 1: Build Frontend
+ FROM node:20-alpine as frontend-builder
+
+ WORKDIR /frontend
+
+ # Copy frontend files
+ COPY FRRONTEEEND/package*.json ./
+ RUN npm install
+
+ COPY FRRONTEEEND/ ./
+ RUN npm run build
+
+ # Stage 2: Build Python environment
+ FROM python:3.13-slim as builder
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     g++ \
+     make \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create virtual environment
+ RUN python -m venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"
+
+ # Copy requirements and install Python packages
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Stage 3: Runtime environment
+ FROM python:3.13-slim
+
+ # Install runtime dependencies only
+ RUN apt-get update && apt-get install -y \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy virtual environment from builder
+ COPY --from=builder /opt/venv /opt/venv
+ ENV PATH="/opt/venv/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy application code
+ COPY src/ /app/src/
+ COPY examples/ /app/examples/
+
+ # Copy built frontend from frontend-builder
+ COPY --from=frontend-builder /frontend/dist /app/FRRONTEEEND/dist
+
+ # Create necessary directories for Cloud Run ephemeral storage
+ RUN mkdir -p /tmp/data_science_agent \
+     /tmp/outputs/models \
+     /tmp/outputs/plots \
+     /tmp/outputs/reports \
+     /tmp/outputs/data \
+     /tmp/cache_db
+
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1
+ ENV PORT=8080
+ ENV OUTPUT_DIR=/tmp/outputs
+ ENV CACHE_DB_PATH=/tmp/cache_db/cache.db
+ ENV ARTIFACT_BACKEND=local
+
+ # Cloud Run expects the service to listen on the PORT env variable
+ EXPOSE 8080
+
+ # Health check (optional, Cloud Run handles this)
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+     CMD python -c "import requests; requests.get('http://localhost:8080/health')" || exit 1
+
+ # Run the FastAPI application
+ CMD ["python", "src/api/app.py"]
FRONTEND_INTEGRATION.md ADDED
@@ -0,0 +1,234 @@
+ # Data Science Agent - Frontend Integration Guide
+
+ ## 🎉 New React Frontend
+
+ The application now features a modern, professional React frontend that replaces the old Gradio interface.
+
+ ### Features
+
+ - **Beautiful Landing Page**: Showcases the agent's capabilities with modern design
+ - **Professional Chat Interface**: NextChat-style conversational UI
+ - **Direct Backend Integration**: Communicates with your FastAPI backend
+ - **Responsive Design**: Works on all devices
+ - **Dark Theme**: Modern, eye-friendly interface
+
+ ## 🚀 Quick Start
+
+ ### Prerequisites
+
+ - Python 3.13+
+ - Node.js 20+
+ - npm (comes with Node.js)
+
+ ### Running the Application
+
+ #### Option 1: Using the Build Script (Recommended)
+
+ **Windows:**
+ ```powershell
+ .\build-and-deploy.ps1
+ ```
+
+ **Linux/Mac:**
+ ```bash
+ chmod +x build-and-deploy.sh
+ ./build-and-deploy.sh
+ ```
+
+ Then start the server:
+ ```bash
+ python src/api/app.py
+ ```
+
+ #### Option 2: Manual Steps
+
+ 1. **Build the Frontend:**
+    ```bash
+    cd FRRONTEEEND
+    npm.cmd install
+    npm.cmd run build
+    cd ..
+    ```
+
+ 2. **Install Python Dependencies:**
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 3. **Start the Backend Server:**
+    ```bash
+    python src/api/app.py
+    ```
+
+ 4. **Access the Application:**
+    Open your browser and navigate to: http://localhost:8080
+
+ ## 🏗️ Architecture
+
+ ### Backend (FastAPI)
+ - **Location**: `src/api/app.py`
+ - **Port**: 8080
+ - **Endpoints**:
+   - `GET /` - Health check & landing page
+   - `POST /chat` - Chat interface endpoint
+   - `POST /run` - Full data science workflow
+   - `POST /profile` - Dataset profiling
+   - `GET /tools` - List available tools
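+
+ A quick way to smoke-test these endpoints from Python once the server is up (the file path is a placeholder):
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8080"
+
+ print(requests.get(f"{BASE}/tools", timeout=30).json())  # list available tools
+
+ # Profile a local dataset via multipart upload
+ with open("data/sample.csv", "rb") as f:
+     print(requests.post(f"{BASE}/profile", files={"file": f}, timeout=300).json())
+ ```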
77
+
78
+ ### Frontend (React + Vite)
79
+ - **Location**: `FRRONTEEEND/`
80
+ - **Build Output**: `FRRONTEEEND/dist/`
81
+ - **Dev Port**: 3000 (development mode)
82
+ - **Production**: Served by FastAPI at port 8080
83
+
84
+ ## 🔧 Development Mode
85
+
86
+ If you want to develop the frontend with hot-reloading:
87
+
88
+ 1. **Terminal 1 - Backend:**
89
+ ```bash
90
+ python src/api/app.py
91
+ ```
92
+
93
+ 2. **Terminal 2 - Frontend:**
94
+ ```bash
95
+ cd FRRONTEEEND
96
+ npm run dev
97
+ ```
98
+
99
+ Access:
100
+ - Frontend (dev): http://localhost:3000
101
+ - Backend API: http://localhost:8080
102
+
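+ If the two servers run separately, requests from the dev frontend must reach the backend on port 8080. One way to wire this up is a Vite dev proxy; the snippet below is a sketch (the repo's `FRRONTEEEND/vite.config.ts` may already handle this differently, e.g. by using `VITE_API_URL` directly):
+
+ ```ts
+ // FRRONTEEEND/vite.config.ts — hypothetical dev-proxy variant
+ import { defineConfig } from 'vite';
+
+ export default defineConfig({
+   server: {
+     port: 3000,
+     proxy: {
+       // Forward backend routes to FastAPI during development
+       '/chat': 'http://localhost:8080',
+       '/run': 'http://localhost:8080',
+       '/profile': 'http://localhost:8080',
+     },
+   },
+ });
+ ```
+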
103
+ ## 🌐 API Integration
104
+
105
+ The frontend now communicates with your FastAPI backend instead of calling external APIs directly.
106
+
107
+ ### Environment Variables
108
+
109
+ Create `FRRONTEEEND/.env` for local development:
110
+ ```env
111
+ VITE_API_URL=http://localhost:8080
112
+ ```
113
+
114
+ For production, update `FRRONTEEEND/.env.production`:
115
+ ```env
116
+ VITE_API_URL=https://your-cloud-run-url.run.app
117
+ ```
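+
+ The frontend can then resolve the backend base URL from Vite's `import.meta.env`, falling back to the page's own origin when FastAPI serves the built frontend (the same fallback `ChatInterface.tsx` uses):
+
+ ```ts
+ // Prefer the build-time VITE_API_URL; otherwise assume a same-origin deployment
+ const API_URL: string = import.meta.env.VITE_API_URL || window.location.origin;
+ ```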
118
+
119
+ ## 📦 Deployment
120
+
121
+ ### Docker Build
122
+
123
+ The Dockerfile now includes a multi-stage build that:
124
+ 1. Builds the React frontend
125
+ 2. Builds the Python environment
126
+ 3. Combines both in the final image
127
+
128
+ ```bash
129
+ docker build -t data-science-agent .
130
+ docker run -p 8080:8080 data-science-agent
131
+ ```
132
+
133
+ ### Google Cloud Run
134
+
135
+ ```bash
136
+ gcloud builds submit --tag gcr.io/YOUR-PROJECT-ID/data-science-agent
137
+ gcloud run deploy data-science-agent \
138
+ --image gcr.io/YOUR-PROJECT-ID/data-science-agent \
139
+ --platform managed \
140
+ --region us-central1 \
141
+ --allow-unauthenticated \
142
+ --set-env-vars GROQ_API_KEY=your-api-key
143
+ ```
144
+
145
+ ## 🔄 What Changed
146
+
147
+ ### Removed
148
+ - ❌ Gradio interface (`chat_ui.py` remains in the repo for reference but is no longer served)
149
+ - ❌ Direct Google GenAI calls from frontend
150
+ - ❌ Gradio dependency
151
+
152
+ ### Added
153
+ - ✅ React + TypeScript frontend with Vite
154
+ - ✅ Professional landing page
155
+ - ✅ Modern chat interface
156
+ - ✅ `/chat` API endpoint
157
+ - ✅ CORS support in FastAPI
158
+ - ✅ Static file serving for React app
159
+ - ✅ Multi-stage Docker build
160
+
161
+ ## 🛠️ Tech Stack
162
+
163
+ ### Frontend
164
+ - React 19
165
+ - TypeScript 5.8
166
+ - Vite 6
167
+ - Tailwind CSS
168
+ - Framer Motion (animations)
169
+ - Lucide React (icons)
170
+
171
+ ### Backend (unchanged)
172
+ - FastAPI
173
+ - Python 3.13
174
+ - Groq API
175
+ - Polars, DuckDB
176
+ - Scikit-learn, XGBoost, LightGBM
177
+
178
+ ## 📁 Project Structure
179
+
180
+ ```
181
+ .
182
+ ├── FRRONTEEEND/ # React frontend
183
+ │ ├── components/ # React components
184
+ │ ├── dist/ # Built frontend (after npm run build)
185
+ │ ├── package.json
186
+ │ ├── vite.config.ts
187
+ │ └── .env # Frontend environment variables
188
+ ├── src/
189
+ │ ├── api/
190
+ │ │ └── app.py # FastAPI backend (updated)
191
+ │ ├── tools/ # Data science tools
192
+ │ └── orchestrator.py # Main agent logic
193
+ ├── requirements.txt # Python dependencies (updated)
194
+ ├── Dockerfile # Multi-stage build (updated)
195
+ ├── build-and-deploy.ps1 # Windows build script
196
+ └── build-and-deploy.sh # Linux/Mac build script
197
+ ```
198
+
199
+ ## 🐛 Troubleshooting
200
+
201
+ ### Frontend doesn't load
202
+ - Make sure you've run `npm run build` in the FRRONTEEEND directory
203
+ - Check that `FRRONTEEEND/dist/` exists and contains files
204
+
205
+ ### API errors in chat
206
+ - Ensure the backend is running on port 8080
207
+ - Check that `GROQ_API_KEY` is set in your environment
208
+ - Verify the API URL in `.env` file
209
+
210
+ ### CORS errors
211
+ - The backend now has CORS enabled for development
212
+ - For production, update the `allow_origins` in `src/api/app.py`
213
+
214
+ ## 📝 Notes
215
+
216
+ - The old `chat_ui.py` has been kept for reference but is no longer used
217
+ - All chat functionality now goes through the `/chat` endpoint
218
+ - The frontend is automatically served by FastAPI in production mode
219
+ - Session history is maintained in the frontend (browser)
220
+
221
+ ## 🎯 Next Steps
222
+
223
+ 1. **Customize the frontend**: Edit files in `FRRONTEEEND/components/`
224
+ 2. **Extend file upload**: `ChatInterface.tsx` already accepts CSV/Parquet uploads; add support for more formats
225
+ 3. **Add visualization**: Display charts from the backend in the chat
226
+ 4. **Authentication**: Add user authentication if needed
227
+
228
+ ## 📞 Support
229
+
230
+ For issues or questions:
231
+ 1. Check the console logs (browser & terminal)
232
+ 2. Verify environment variables
233
+ 3. Ensure all dependencies are installed
234
+ 4. Review the API documentation at http://localhost:8080/docs
FRRONTEEEND/.env.production ADDED
@@ -0,0 +1,3 @@
1
+ # Production API Configuration
2
+ # Update this to your production API URL
3
+ VITE_API_URL=https://your-cloud-run-url.run.app
FRRONTEEEND/.gitignore ADDED
@@ -0,0 +1,24 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
FRRONTEEEND/App.tsx ADDED
@@ -0,0 +1,59 @@
1
+
2
+ import React, { useState } from 'react';
3
+ import { HeroGeometric } from './components/HeroGeometric';
4
+ import ProblemSolution from './components/ProblemSolution';
5
+ import KeyCapabilities from './components/KeyCapabilities';
6
+ import Process from './components/Process';
7
+ import TechStack from './components/TechStack';
8
+ import Footer from './components/Footer';
9
+ import { BackgroundPaths } from './components/BackgroundPaths';
10
+ import { Logo } from './components/Logo';
11
+ import { ChatInterface } from './components/ChatInterface';
12
+
13
+ const App: React.FC = () => {
14
+ const [view, setView] = useState<'landing' | 'chat'>('landing');
15
+
16
+ if (view === 'chat') {
17
+ return <ChatInterface onBack={() => setView('landing')} />;
18
+ }
19
+
20
+ return (
21
+ <div className="min-h-screen bg-[#030303] text-white selection:bg-indigo-500/30">
22
+ {/* Navigation (Overlay) */}
23
+ <nav className="fixed top-0 left-0 right-0 z-50 flex justify-between items-center px-6 py-4 backdrop-blur-md bg-[#030303]/20 border-b border-white/5">
24
+ <div className="flex items-center gap-3 cursor-pointer" onClick={() => setView('landing')}>
25
+ <Logo className="w-10 h-10" />
26
+ <span className="font-bold tracking-tight text-lg hidden sm:block uppercase text-white">
27
+ DATA SCIENCE AGENT
28
+ </span>
29
+ </div>
30
+
31
+ <button
32
+ onClick={() => setView('chat')}
33
+ className="px-5 py-2 bg-white/5 hover:bg-white/10 border border-white/10 rounded-lg text-sm font-medium transition-all"
34
+ >
35
+ Launch Console
36
+ </button>
37
+ </nav>
38
+
39
+ <main>
40
+ <HeroGeometric onChatClick={() => setView('chat')} />
41
+ <TechStack />
42
+ <ProblemSolution />
43
+ <KeyCapabilities />
44
+
45
+ {/* Transitional background paths section */}
46
+ <BackgroundPaths
47
+ title="Intelligence Without Limits"
48
+ subtitle="The agent continuously learns from your specific domain, optimizing its own tools and reasoning strategies to solve your hardest data challenges."
49
+ />
50
+
51
+ <Process />
52
+ </main>
53
+
54
+ <Footer />
55
+ </div>
56
+ );
57
+ };
58
+
59
+ export default App;
FRRONTEEEND/README.md ADDED
@@ -0,0 +1,20 @@
1
+ <div align="center">
2
+ <img width="1200" height="475" alt="GHBanner" src="https://github.com/user-attachments/assets/0aa67016-6eaf-458a-adb2-6e31a0763ed6" />
3
+ </div>
4
+
5
+ # Run and deploy your AI Studio app
6
+
7
+ This contains everything you need to run your app locally.
8
+
9
+ View your app in AI Studio: https://ai.studio/apps/drive/1gChoktTuh429q26FzxS4BPo0q0LnlRE9
10
+
11
+ ## Run Locally
12
+
13
+ **Prerequisites:** Node.js
14
+
15
+
16
+ 1. Install dependencies:
17
+ `npm install`
18
+ 2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key
19
+ 3. Run the app:
20
+ `npm run dev`
FRRONTEEEND/components/BackgroundPaths.tsx ADDED
@@ -0,0 +1,148 @@
1
+
2
+ import React from "react";
3
+ import { motion } from "framer-motion";
4
+ import { ArrowRight } from "lucide-react";
5
+ import { cn } from "../lib/utils";
6
+
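+ // Renders 36 animated SVG bezier paths; position (+1 / -1) mirrors the fan direction.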
7
+ function FloatingPaths({ position }: { position: number }) {
8
+ const paths = Array.from({ length: 36 }, (_, i) => ({
9
+ id: i,
10
+ d: `M-${380 - i * 5 * position} -${189 + i * 6}C-${
11
+ 380 - i * 5 * position
12
+ } -${189 + i * 6} -${312 - i * 5 * position} ${216 - i * 6} ${
13
+ 152 - i * 5 * position
14
+ } ${343 - i * 6}C${616 - i * 5 * position} ${470 - i * 6} ${
15
+ 684 - i * 5 * position
16
+ } ${875 - i * 6} ${684 - i * 5 * position} ${875 - i * 6}`,
17
+ color: `rgba(99,102,241,${0.05 + i * 0.01})`, // Using indigo-500 tint
18
+ width: 0.5 + i * 0.03,
19
+ }));
20
+
21
+ return (
22
+ <div className="absolute inset-0 pointer-events-none">
23
+ <svg
24
+ className="w-full h-full text-indigo-500/20"
25
+ viewBox="0 0 696 316"
26
+ fill="none"
27
+ >
28
+ <title>Background Paths</title>
29
+ {paths.map((path) => (
30
+ <motion.path
31
+ key={path.id}
32
+ d={path.d}
33
+ stroke="currentColor"
34
+ strokeWidth={path.width}
35
+ strokeOpacity={0.1 + path.id * 0.02}
36
+ initial={{ pathLength: 0.3, opacity: 0.4 }}
37
+ animate={{
38
+ pathLength: 1,
39
+ opacity: [0.2, 0.5, 0.2],
40
+ pathOffset: [0, 1, 0],
41
+ }}
42
+ transition={{
43
+ duration: 15 + Math.random() * 10,
44
+ repeat: Number.POSITIVE_INFINITY,
45
+ ease: "linear",
46
+ }}
47
+ />
48
+ ))}
49
+ </svg>
50
+ </div>
51
+ );
52
+ }
53
+
54
+ export function BackgroundPaths({
55
+ title = "The Future is Autonomous",
56
+ subtitle = "Scale your data engineering and predictive modeling beyond human limits.",
57
+ }: {
58
+ title?: string;
59
+ subtitle?: string;
60
+ }) {
61
+ const words = title.split(" ");
62
+
63
+ return (
64
+ <section className="relative min-h-[80vh] w-full flex items-center justify-center overflow-hidden bg-[#030303]">
65
+ <div className="absolute inset-0">
66
+ <FloatingPaths position={1} />
67
+ <FloatingPaths position={-1} />
68
+ </div>
69
+
70
+ <div className="relative z-10 container mx-auto px-4 md:px-6 text-center">
71
+ <motion.div
72
+ initial={{ opacity: 0 }}
73
+ animate={{ opacity: 1 }}
74
+ transition={{ duration: 2 }}
75
+ className="max-w-4xl mx-auto"
76
+ >
77
+ <h2 className="text-5xl sm:text-6xl md:text-8xl font-extrabold mb-8 tracking-tighter">
78
+ {words.map((word, wordIndex) => (
79
+ <span
80
+ key={wordIndex}
81
+ className="inline-block mr-4 last:mr-0"
82
+ >
83
+ {word.split("").map((letter, letterIndex) => (
84
+ <motion.span
85
+ key={`${wordIndex}-${letterIndex}`}
86
+ initial={{ y: 50, opacity: 0 }}
87
+ whileInView={{ y: 0, opacity: 1 }}
88
+ viewport={{ once: true }}
89
+ transition={{
90
+ delay:
91
+ wordIndex * 0.1 +
92
+ letterIndex * 0.02,
93
+ type: "spring",
94
+ stiffness: 150,
95
+ damping: 25,
96
+ }}
97
+ className="inline-block text-transparent bg-clip-text
98
+ bg-gradient-to-r from-white via-white/90 to-white/70"
99
+ >
100
+ {letter}
101
+ </motion.span>
102
+ ))}
103
+ </span>
104
+ ))}
105
+ </h2>
106
+
107
+ <motion.p
108
+ initial={{ opacity: 0, y: 20 }}
109
+ whileInView={{ opacity: 1, y: 0 }}
110
+ viewport={{ once: true }}
111
+ transition={{ delay: 0.5 }}
112
+ className="text-white/40 text-xl font-medium mb-12 max-w-2xl mx-auto tracking-tight"
113
+ >
114
+ {subtitle}
115
+ </motion.p>
116
+
117
+ <motion.div
118
+ initial={{ opacity: 0, scale: 0.9 }}
119
+ whileInView={{ opacity: 1, scale: 1 }}
120
+ viewport={{ once: true }}
121
+ transition={{ delay: 0.8 }}
122
+ className="inline-block group relative bg-gradient-to-b from-white/10 to-indigo-500/10
123
+ p-px rounded-2xl backdrop-blur-lg
124
+ overflow-hidden shadow-lg hover:shadow-indigo-500/20 transition-all duration-300"
125
+ >
126
+ <button
127
+ className="rounded-[1.15rem] px-10 py-5 text-lg font-bold backdrop-blur-md
128
+ bg-white/95 hover:bg-white text-black transition-all duration-300
129
+ group-hover:-translate-y-0.5 border border-white/10
130
+ flex items-center gap-3"
131
+ >
132
+ <span className="opacity-90 group-hover:opacity-100 transition-opacity">
133
+ Deploy Your First Agent
134
+ </span>
135
+ <ArrowRight
136
+ className="w-5 h-5 opacity-70 group-hover:opacity-100 group-hover:translate-x-1.5
137
+ transition-all duration-300"
138
+ />
139
+ </button>
140
+ </motion.div>
141
+ </motion.div>
142
+ </div>
143
+
144
+ {/* Subtle glow effect at the bottom */}
145
+ <div className="absolute bottom-0 left-1/2 -translate-x-1/2 w-full h-px bg-gradient-to-r from-transparent via-indigo-500/50 to-transparent shadow-[0_0_50px_2px_rgba(99,102,241,0.2)]" />
146
+ </section>
147
+ );
148
+ }
FRRONTEEEND/components/ChatInterface.tsx ADDED
@@ -0,0 +1,571 @@
1
+
2
+ import React, { useState, useRef, useEffect } from 'react';
3
+ import { motion, AnimatePresence } from 'framer-motion';
4
+ import { Send, Plus, Search, Settings, MoreHorizontal, User, Bot, ArrowLeft, Paperclip, Sparkles, Trash2, X, Upload } from 'lucide-react';
5
+ import { cn } from '../lib/utils';
6
+ import { Logo } from './Logo';
7
+ import ReactMarkdown from 'react-markdown';
8
+
9
+ interface Message {
10
+ id: string;
11
+ role: 'user' | 'assistant';
12
+ content: string;
13
+ timestamp: Date;
14
+ file?: {
15
+ name: string;
16
+ size: number;
17
+ };
18
+ reports?: Array<{
19
+ name: string;
20
+ path: string;
21
+ }>;
22
+ }
23
+
24
+ interface ChatSession {
25
+ id: string;
26
+ title: string;
27
+ messages: Message[];
28
+ updatedAt: Date;
29
+ }
30
+
31
+ export const ChatInterface: React.FC<{ onBack: () => void }> = ({ onBack }) => {
32
+ const [sessions, setSessions] = useState<ChatSession[]>([
33
+ {
34
+ id: '1',
35
+ title: 'ML Model Analysis',
36
+ messages: [],
37
+ updatedAt: new Date(),
38
+ }
39
+ ]);
40
+ const [activeSessionId, setActiveSessionId] = useState('1');
41
+ const [input, setInput] = useState('');
42
+ const [isTyping, setIsTyping] = useState(false);
43
+ const [uploadedFile, setUploadedFile] = useState<File | null>(null);
44
+ const [reportModalUrl, setReportModalUrl] = useState<string | null>(null);
45
+ const fileInputRef = useRef<HTMLInputElement>(null);
46
+ const scrollRef = useRef<HTMLDivElement>(null);
47
+
48
+ const activeSession = sessions.find(s => s.id === activeSessionId) || sessions[0];
49
+
50
+ useEffect(() => {
51
+ if (scrollRef.current) {
52
+ scrollRef.current.scrollTop = scrollRef.current.scrollHeight;
53
+ }
54
+ }, [activeSession.messages, isTyping]);
55
+
56
+ const handleSend = async () => {
57
+ if ((!input.trim() && !uploadedFile) || isTyping) return;
58
+
59
+ const userMessage: Message = {
60
+ id: Date.now().toString(),
61
+ role: 'user',
62
+ content: input || (uploadedFile ? `Uploaded: ${uploadedFile.name}` : ''),
63
+ timestamp: new Date(),
64
+ file: uploadedFile ? { name: uploadedFile.name, size: uploadedFile.size } : undefined,
65
+ };
66
+
67
+ const newMessages = [...activeSession.messages, userMessage];
68
+ updateSession(activeSessionId, newMessages);
69
+ setInput('');
70
+ setIsTyping(true);
71
+
72
+ try {
73
+ // Prefer the build-time VITE_API_URL; fall back to the page's own origin
+ // when the frontend is served by the FastAPI backend itself
74
+ const API_URL = import.meta.env.VITE_API_URL || window.location.origin;
75
+ console.log('API URL:', API_URL);
76
+
77
+ let response;
78
+
79
+ if (uploadedFile) {
80
+ const formData = new FormData();
81
+ formData.append('file', uploadedFile);
82
+ formData.append('task_description', input || 'Analyze this dataset and provide insights');
83
+ formData.append('use_cache', 'true');
84
+ formData.append('max_iterations', '20');
85
+
86
+ response = await fetch(`${API_URL}/run`, {
87
+ method: 'POST',
88
+ body: formData
89
+ });
90
+
91
+ setUploadedFile(null);
92
+ } else {
93
+ response = await fetch(`${API_URL}/chat`, {
94
+ method: 'POST',
95
+ headers: {
96
+ 'Content-Type': 'application/json',
97
+ },
98
+ body: JSON.stringify({
99
+ messages: newMessages.map(m => ({
100
+ role: m.role,
101
+ content: m.content
102
+ })),
103
+ stream: false
104
+ })
105
+ });
106
+ }
107
+
108
+ if (!response.ok) {
109
+ throw new Error(`API error: ${response.status}`);
110
+ }
111
+
112
+ const data = await response.json();
113
+
114
+ let assistantContent = '';
115
+ let reports: Array<{name: string, path: string}> = [];
116
+
117
+ if (uploadedFile && data.result) {
118
+ const result = data.result;
119
+ assistantContent = `✅ Analysis Complete!\n\n`;
120
+
121
+ // Extract report paths from workflow history
122
+ if (result.workflow_history) {
123
+ const reportTools = ['generate_ydata_profiling_report', 'generate_sweetviz_report', 'generate_combined_eda_report'];
124
+ result.workflow_history.forEach((step: any) => {
125
+ if (reportTools.includes(step.tool)) {
126
+ // Check multiple possible locations for the report path
127
+ const reportPath = step.result?.output_path || step.result?.report_path || step.arguments?.output_path;
128
+
129
+ if (reportPath && (step.result?.success !== false)) {
130
+ reports.push({
131
+ name: step.tool.replace('generate_', '').replace(/_/g, ' ').replace('report', '').trim(),
132
+ path: reportPath
133
+ });
134
+ }
135
+ }
136
+ });
137
+ }
138
+
139
+ // Also check for report paths mentioned in the summary text
140
+ if (result.summary && !reports.length) {
141
+ const reportPathMatch = result.summary.match(/\.?(\/outputs\/reports\/[^\s]+\.html)/);
142
+ if (reportPathMatch) {
143
+ reports.push({
144
+ name: 'ydata profiling',
145
+ path: reportPathMatch[1]
146
+ });
147
+ }
148
+ }
149
+
150
+ if (result.summary) {
151
+ assistantContent += `**Summary:**\n${result.summary}\n\n`;
152
+ }
153
+
154
+ if (result.workflow_history && result.workflow_history.length > 0) {
155
+ assistantContent += `**Tools Used:** ${result.workflow_history.length} steps\n\n`;
156
+ assistantContent += `**Final Result:**\n${result.final_result || 'Analysis completed successfully'}`;
157
+ }
158
+ } else if (data.success && data.message) {
159
+ assistantContent = data.message;
160
+ } else {
161
+ throw new Error('Invalid response from API');
162
+ }
163
+
164
+ updateSession(activeSessionId, [...newMessages, {
165
+ id: (Date.now() + 1).toString(),
166
+ role: 'assistant',
167
+ content: assistantContent,
168
+ timestamp: new Date(),
169
+ reports: reports.length > 0 ? reports : undefined
170
+ }]);
171
+ } catch (error: any) {
172
+ console.error("Chat Error:", error);
173
+
174
+ let errorMessage = "I'm sorry, I encountered an error processing your request.";
175
+
176
+ if (error.message) {
177
+ errorMessage += `\n\n**Error:** ${error.message}`;
178
+ }
179
+
180
+ // If a Response-like object was thrown, try to read its error body; plain Error objects are skipped safely
181
+ try {
182
+ const errorText = await error.text?.();
183
+ if (errorText) {
184
+ const errorData = JSON.parse(errorText);
185
+ if (errorData.detail) {
186
+ errorMessage = `**Error:** ${typeof errorData.detail === 'string' ? errorData.detail : JSON.stringify(errorData.detail)}`;
187
+ }
188
+ }
189
+ } catch (e) {
190
+ // Ignore parsing errors
191
+ }
192
+
193
+ updateSession(activeSessionId, [...newMessages, {
194
+ id: 'err-' + Date.now(),
195
+ role: 'assistant',
196
+ content: errorMessage,
197
+ timestamp: new Date()
198
+ }]);
199
+ } finally {
200
+ setIsTyping(false);
201
+ }
202
+ };
203
+
204
+ const updateSession = (id: string, messages: Message[]) => {
205
+ setSessions(prev => prev.map(s => {
206
+ if (s.id === id) {
207
+ return { ...s, messages, updatedAt: new Date() };
208
+ }
209
+ return s;
210
+ }));
211
+ };
212
+
213
+ const createNewChat = () => {
214
+ const newId = Date.now().toString();
215
+ const newSession: ChatSession = {
216
+ id: newId,
217
+ title: 'New Chat',
218
+ messages: [],
219
+ updatedAt: new Date()
220
+ };
221
+ setSessions([newSession, ...sessions]);
222
+ setActiveSessionId(newId);
223
+ };
224
+
225
+ const deleteSession = (e: React.MouseEvent, id: string) => {
226
+ e.stopPropagation();
227
+ if (sessions.length === 1) return;
228
+ setSessions(prev => prev.filter(s => s.id !== id));
229
+ if (activeSessionId === id) {
230
+ setActiveSessionId(sessions.find(s => s.id !== id)?.id || '');
231
+ }
232
+ };
233
+
234
+ const handleFileSelect = (e: React.ChangeEvent<HTMLInputElement>) => {
235
+ const file = e.target.files?.[0];
236
+ if (file) {
237
+ const validTypes = ['.csv', '.parquet'];
238
+ const fileExt = file.name.substring(file.name.lastIndexOf('.')).toLowerCase();
239
+
240
+ if (validTypes.includes(fileExt)) {
241
+ setUploadedFile(file);
242
+ } else {
243
+ alert('Please upload a CSV or Parquet file');
244
+ }
245
+ }
246
+ };
247
+
248
+ const removeFile = () => {
249
+ setUploadedFile(null);
250
+ if (fileInputRef.current) {
251
+ fileInputRef.current.value = '';
252
+ }
253
+ };
254
+
255
+ return (
256
+ <div className="flex h-screen w-full bg-[#050505] overflow-hidden text-white/90">
257
+ {/* Sidebar */}
258
+ <aside className="w-[280px] hidden md:flex flex-col border-r border-white/5 bg-[#0a0a0a]/50 backdrop-blur-xl">
259
+ <div className="p-4 flex flex-col h-full">
260
+ <div className="flex items-center gap-3 mb-8 px-2">
261
+ <Logo className="w-8 h-8" />
262
+ <span className="font-bold tracking-tight text-sm uppercase">Console</span>
263
+ </div>
264
+
265
+ <button
266
+ onClick={createNewChat}
267
+ className="w-full flex items-center gap-3 px-4 py-3 rounded-xl bg-white/5 hover:bg-white/10 border border-white/10 transition-all text-sm font-medium mb-6 group"
268
+ >
269
+ <Plus className="w-4 h-4 group-hover:scale-110 transition-transform" />
270
+ New Conversation
271
+ </button>
272
+
273
+ <div className="flex-1 overflow-y-auto space-y-2 custom-scrollbar">
274
+ <p className="px-3 text-[10px] uppercase tracking-widest text-white/30 font-bold mb-2">History</p>
275
+ {sessions.map(session => (
276
+ <div
277
+ key={session.id}
278
+ onClick={() => setActiveSessionId(session.id)}
279
+ className={cn(
280
+ "group flex items-center justify-between px-4 py-3 rounded-xl cursor-pointer transition-all text-sm",
281
+ activeSessionId === session.id
282
+ ? "bg-white/10 text-white border border-white/10 shadow-lg"
283
+ : "text-white/40 hover:text-white/70 hover:bg-white/5"
284
+ )}
285
+ >
286
+ <span className="truncate flex-1 pr-2">{session.title}</span>
287
+ <Trash2
288
+ onClick={(e) => deleteSession(e, session.id)}
289
+ className="w-4 h-4 opacity-0 group-hover:opacity-100 hover:text-rose-400 transition-all"
290
+ />
291
+ </div>
292
+ ))}
293
+ </div>
294
+
295
+ <div className="mt-auto pt-4 border-t border-white/5 flex items-center justify-between px-2">
296
+ <button onClick={onBack} className="p-2 hover:bg-white/5 rounded-lg transition-colors text-white/40 hover:text-white">
297
+ <ArrowLeft className="w-5 h-5" />
298
+ </button>
299
+ <div className="flex gap-2">
300
+ <button className="p-2 hover:bg-white/5 rounded-lg transition-colors text-white/40 hover:text-white">
301
+ <Settings className="w-5 h-5" />
302
+ </button>
303
+ <button className="p-2 hover:bg-white/5 rounded-lg transition-colors text-white/40 hover:text-white">
304
+ <User className="w-5 h-5" />
305
+ </button>
306
+ </div>
307
+ </div>
308
+ </div>
309
+ </aside>
310
+
311
+ {/* Main Chat Area */}
312
+ <main className="flex-1 flex flex-col relative bg-gradient-to-b from-[#080808] to-[#050505]">
313
+ {/* Top Header */}
314
+ <header className="h-16 flex items-center justify-between px-6 border-b border-white/5 backdrop-blur-md bg-black/20 sticky top-0 z-10">
315
+ <div className="flex items-center gap-4">
316
+ <button onClick={onBack} className="md:hidden p-2 hover:bg-white/5 rounded-lg">
317
+ <ArrowLeft className="w-5 h-5" />
318
+ </button>
319
+ <div>
320
+ <h2 className="text-sm font-bold text-white tracking-tight">{activeSession.title}</h2>
321
+ <p className="text-[10px] text-white/30 font-medium">{activeSession.messages.length} messages in session</p>
322
+ </div>
323
+ </div>
324
+ <div className="flex items-center gap-3">
325
+ <button className="p-2 text-white/40 hover:text-white transition-colors">
326
+ <Search className="w-5 h-5" />
327
+ </button>
328
+ <button className="p-2 text-white/40 hover:text-white transition-colors">
329
+ <MoreHorizontal className="w-5 h-5" />
330
+ </button>
331
+ </div>
332
+ </header>
333
+
334
+ {/* Message List */}
335
+ <div
336
+ ref={scrollRef}
337
+ className="flex-1 overflow-y-auto p-4 md:p-8 space-y-8 scroll-smooth"
338
+ >
339
+ {activeSession.messages.length === 0 ? (
340
+ <div className="h-full flex flex-col items-center justify-center text-center px-4">
341
+ <motion.div
342
+ initial={{ opacity: 0, scale: 0.9 }}
343
+ animate={{ opacity: 1, scale: 1 }}
344
+ className="w-16 h-16 bg-gradient-to-br from-indigo-500/20 to-rose-500/20 rounded-2xl flex items-center justify-center mb-6 border border-white/10"
345
+ >
346
+ <Sparkles className="w-8 h-8 text-indigo-400" />
347
+ </motion.div>
348
+ <h1 className="text-2xl font-extrabold text-white mb-3">Welcome, Data Scientist</h1>
349
+ <p className="text-white/40 max-w-sm leading-relaxed text-sm">
350
+ I'm your autonomous agent ready to profile data, train models, or build dashboards.
351
+ Try uploading a dataset or describing your ML objective.
352
+ </p>
353
+ <div className="grid grid-cols-1 sm:grid-cols-2 gap-3 mt-8 w-full max-w-lg">
354
+ {[
355
+ "Profile my sales.csv",
356
+ "Train a XGBoost classifier",
357
+ "Generate a correlation heatmap",
358
+ "Explain feature importance"
359
+ ].map(prompt => (
360
+ <button
361
+ key={prompt}
362
+ onClick={() => setInput(prompt)}
363
+ className="text-left px-4 py-3 rounded-xl bg-white/[0.03] border border-white/5 hover:bg-white/5 transition-all text-xs text-white/60 hover:text-white"
364
+ >
365
+ "{prompt}"
366
+ </button>
367
+ ))}
368
+ </div>
369
+ </div>
370
+ ) : (
371
+ activeSession.messages.map((msg) => (
372
+ <motion.div
373
+ key={msg.id}
374
+ initial={{ opacity: 0, y: 10 }}
375
+ animate={{ opacity: 1, y: 0 }}
376
+ className={cn(
377
+ "flex w-full gap-4",
378
+ msg.role === 'user' ? "flex-row-reverse" : "flex-row"
379
+ )}
380
+ >
381
+ <div className={cn(
382
+ "w-8 h-8 rounded-lg flex items-center justify-center shrink-0 border border-white/10",
383
+ msg.role === 'user' ? "bg-indigo-500/20" : "bg-white/5"
384
+ )}>
385
+ {msg.role === 'user' ? <User className="w-4 h-4" /> : <Bot className="w-4 h-4 text-indigo-400" />}
386
+ </div>
387
+ <div className={cn(
388
+ "max-w-[80%] md:max-w-[70%] p-4 rounded-2xl text-sm leading-relaxed",
389
+ msg.role === 'user'
390
+ ? "bg-indigo-600/20 text-indigo-50 border border-indigo-500/20"
391
+ : "bg-white/[0.03] text-white/80 border border-white/5"
392
+ )}>
393
+ {msg.file && (
394
+ <div className="mb-2 flex items-center gap-2 text-xs bg-white/5 rounded-lg px-3 py-2 border border-white/10">
395
+ <Paperclip className="w-3 h-3" />
396
+ <span className="font-medium">{msg.file.name}</span>
397
+ <span className="text-white/40">({(msg.file.size / 1024).toFixed(1)} KB)</span>
398
+ </div>
399
+ )}
400
+ {msg.role === 'assistant' ? (
401
+ <ReactMarkdown
402
+ className="prose prose-invert prose-sm max-w-none prose-p:leading-relaxed prose-pre:bg-black/40 prose-pre:border prose-pre:border-white/10 prose-headings:text-white prose-strong:text-white prose-li:text-white/80"
403
+ components={{
404
+ p: ({node, ...props}) => <p className="mb-3 last:mb-0" {...props} />,
405
+ ul: ({node, ...props}) => <ul className="mb-3 space-y-1" {...props} />,
406
+ ol: ({node, ...props}) => <ol className="mb-3 space-y-1" {...props} />,
407
+ li: ({node, ...props}) => <li className="ml-4" {...props} />,
408
+ strong: ({node, ...props}) => <strong className="font-semibold text-white" {...props} />,
409
+ code: ({node, inline, ...props}: any) =>
410
+ inline ?
411
+ <code className="px-1.5 py-0.5 rounded bg-white/10 text-indigo-300 text-xs font-mono" {...props} /> :
412
+ <code className="block p-3 rounded-lg bg-black/40 border border-white/10 text-xs font-mono overflow-x-auto" {...props} />
413
+ }}
414
+ >
415
+ {msg.content || ''}
416
+ </ReactMarkdown>
417
+ ) : (
418
+ msg.content /* user messages render as plain text */
419
+ )}
420
+ {msg.reports && msg.reports.length > 0 && (
421
+ <div className="mt-4 flex flex-wrap gap-2">
422
+ {msg.reports.map((report, idx) => (
423
+ <button
424
+ key={idx}
425
+ onClick={() => setReportModalUrl(`${window.location.origin}${report.path}`)}
426
+ className="flex items-center gap-2 px-4 py-2 rounded-lg bg-indigo-500/20 hover:bg-indigo-500/30 border border-indigo-500/30 text-indigo-200 text-xs font-medium transition-all group"
427
+ >
428
+ <Sparkles className="w-3.5 h-3.5 group-hover:scale-110 transition-transform" />
429
+ View {report.name} Report
430
+ </button>
431
+ ))}
432
+ </div>
433
+ )}
434
+ <div className="mt-2 text-[10px] opacity-20 font-mono">
435
+ {msg.timestamp.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}
436
+ </div>
437
+ </div>
438
+ </motion.div>
439
+ ))
440
+ )}
441
+ {isTyping && activeSession.messages[activeSession.messages.length - 1]?.role === 'user' && (
442
+ <div className="flex gap-4">
443
+ <div className="w-8 h-8 rounded-lg flex items-center justify-center shrink-0 bg-white/5 border border-white/10">
444
+ <Bot className="w-4 h-4 text-indigo-400" />
445
+ </div>
446
+ <div className="bg-white/[0.03] p-4 rounded-2xl border border-white/5">
447
+ <div className="flex gap-1">
448
+ <span className="w-1.5 h-1.5 bg-white/20 rounded-full animate-bounce [animation-delay:-0.3s]"></span>
449
+ <span className="w-1.5 h-1.5 bg-white/20 rounded-full animate-bounce [animation-delay:-0.15s]"></span>
450
+ <span className="w-1.5 h-1.5 bg-white/20 rounded-full animate-bounce"></span>
451
+ </div>
452
+ </div>
453
+ </div>
454
+ )}
455
+ </div>
456
+
457
+ {/* Input Bar */}
458
+ <div className="p-4 md:p-8 pt-0">
459
+ <div className="max-w-4xl mx-auto relative">
460
+ <div className="absolute -top-10 left-4 flex gap-2">
461
+ <input
462
+ ref={fileInputRef}
463
+ type="file"
464
+ accept=".csv,.parquet"
465
+ onChange={handleFileSelect}
466
+ className="hidden"
467
+ id="file-upload"
468
+ />
469
+ <label
470
+ htmlFor="file-upload"
471
+ className="flex items-center gap-1.5 px-3 py-1 rounded-full bg-white/[0.03] border border-white/5 text-[10px] text-white/40 hover:text-white hover:bg-white/5 transition-all cursor-pointer"
472
+ >
473
+ <Upload className="w-3 h-3" /> Upload Dataset
474
+ </label>
475
+ {uploadedFile && (
476
+ <div className="flex items-center gap-2 px-3 py-1 rounded-full bg-indigo-500/20 border border-indigo-500/30 text-[10px] text-indigo-200">
477
+ <Paperclip className="w-3 h-3" />
478
+ <span className="max-w-[150px] truncate">{uploadedFile.name}</span>
479
+ <button onClick={removeFile} className="hover:text-white transition-colors">
480
+ <X className="w-3 h-3" />
481
+ </button>
482
+ </div>
483
+ )}
484
+ </div>
485
+ <div className="relative group">
486
+ <textarea
487
+ value={input}
488
+ onChange={(e) => setInput(e.target.value)}
489
+ onKeyDown={(e) => {
490
+ if (e.key === 'Enter' && !e.shiftKey) {
491
+ e.preventDefault();
492
+ handleSend();
493
+ }
494
+ }}
495
+ placeholder={uploadedFile ? "Describe what you want to do with this dataset..." : "Ask your agent anything or upload a dataset..."}
496
+ className="w-full bg-[#0d0d0d] border border-white/10 rounded-2xl p-4 pr-16 text-sm min-h-[56px] max-h-48 resize-none focus:outline-none focus:border-indigo-500/50 focus:ring-1 focus:ring-indigo-500/20 transition-all text-white/90 placeholder:text-white/20 shadow-2xl"
497
+ />
498
+ <button
499
+ onClick={handleSend}
500
+ disabled={(!input.trim() && !uploadedFile) || isTyping}
501
+ className={cn(
502
+ "absolute right-3 bottom-3 p-2.5 rounded-xl transition-all",
503
+ (input.trim() || uploadedFile) && !isTyping
504
+ ? "bg-white text-black hover:scale-105 active:scale-95"
505
+ : "bg-white/5 text-white/20 cursor-not-allowed"
506
+ )}
507
+ >
508
+ <Send className="w-4 h-4" />
509
+ </button>
510
+ </div>
511
+ <p className="text-center mt-3 text-[10px] text-white/20 font-medium">
512
+ Enterprise Data Agent v3.1 | Secured with end-to-end encryption
513
+ </p>
514
+ </div>
515
+ </div>
516
+ </main>
517
+
518
+ {/* Report Modal */}
519
+ <AnimatePresence>
520
+ {reportModalUrl && (
521
+ <motion.div
522
+ initial={{ opacity: 0 }}
523
+ animate={{ opacity: 1 }}
524
+ exit={{ opacity: 0 }}
525
+ className="fixed inset-0 bg-black/80 backdrop-blur-sm z-50 flex items-center justify-center p-4"
526
+ onClick={() => setReportModalUrl(null)}
527
+ >
528
+ <motion.div
529
+ initial={{ scale: 0.95, opacity: 0 }}
530
+ animate={{ scale: 1, opacity: 1 }}
531
+ exit={{ scale: 0.95, opacity: 0 }}
532
+ className="bg-[#0a0a0a] border border-white/10 rounded-2xl w-full max-w-7xl h-[90vh] flex flex-col overflow-hidden shadow-2xl"
533
+ onClick={(e) => e.stopPropagation()}
534
+ >
535
+ <div className="flex items-center justify-between p-4 border-b border-white/5">
536
+ <h3 className="text-lg font-semibold text-white">Data Profiling Report</h3>
537
+ <button
538
+ onClick={() => setReportModalUrl(null)}
539
+ className="p-2 rounded-lg hover:bg-white/5 transition-colors"
540
+ >
541
+ <X className="w-5 h-5" />
542
+ </button>
543
+ </div>
544
+ <iframe
545
+ src={reportModalUrl}
546
+ className="flex-1 w-full bg-white"
547
+ title="Report Viewer"
548
+ />
549
+ </motion.div>
550
+ </motion.div>
551
+ )}
552
+ </AnimatePresence>
553
+
554
+ <style>{`
555
+ .custom-scrollbar::-webkit-scrollbar {
556
+ width: 4px;
557
+ }
558
+ .custom-scrollbar::-webkit-scrollbar-track {
559
+ background: transparent;
560
+ }
561
+ .custom-scrollbar::-webkit-scrollbar-thumb {
562
+ background: rgba(255, 255, 255, 0.05);
563
+ border-radius: 10px;
564
+ }
565
+ .custom-scrollbar::-webkit-scrollbar-thumb:hover {
566
+ background: rgba(255, 255, 255, 0.1);
567
+ }
568
+ `}</style>
569
+ </div>
570
+ );
571
+ };
FRRONTEEEND/components/Footer.tsx ADDED
@@ -0,0 +1,171 @@
1
+
2
+ import React, { useRef, useId, useEffect } from 'react';
3
+ import { motion, animate, useMotionValue, AnimationPlaybackControls } from 'framer-motion';
4
+ import { ArrowRight } from 'lucide-react';
5
+ import { Logo } from './Logo';
6
+
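+ // Linearly remap value from the range [fromLow, fromHigh] to [toLow, toHigh].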
7
+ function mapRange(
8
+ value: number,
9
+ fromLow: number,
10
+ fromHigh: number,
11
+ toLow: number,
12
+ toHigh: number
13
+ ): number {
14
+ if (fromLow === fromHigh) {
15
+ return toLow;
16
+ }
17
+ const percentage = (value - fromLow) / (fromHigh - fromLow);
18
+ return toLow + percentage * (toHigh - toLow);
19
+ }
20
+
21
+ const Footer = () => {
22
+ const id = useId().replace(/:/g, "");
23
+ const instanceId = `footer-shadow-${id}`;
24
+ const feColorMatrixRef = useRef<SVGFEColorMatrixElement>(null);
25
+ const hueRotateMotionValue = useMotionValue(0);
26
+ const hueRotateAnimation = useRef<AnimationPlaybackControls | null>(null);
27
+
28
+ // Configuration from ShadowSection
29
+ const animationScale = 50;
30
+ const animationSpeed = 15;
31
+ const displacementScale = mapRange(animationScale, 1, 100, 20, 100);
32
+ const animationDuration = mapRange(animationSpeed, 1, 100, 1000, 50);
33
+
34
+ useEffect(() => {
35
+ if (feColorMatrixRef.current) {
36
+ hueRotateAnimation.current = animate(hueRotateMotionValue, 360, {
37
+ duration: animationDuration / 25,
38
+ repeat: Infinity,
39
+ repeatType: "loop",
40
+ ease: "linear",
41
+ onUpdate: (value: number) => {
42
+ if (feColorMatrixRef.current) {
43
+ feColorMatrixRef.current.setAttribute("values", String(value));
44
+ }
45
+ }
46
+ });
47
+ return () => hueRotateAnimation.current?.stop();
48
+ }
49
+ }, [animationDuration, hueRotateMotionValue]);
50
+
51
+ return (
52
+ <footer className="bg-[#030303] overflow-hidden">
53
+ {/* High-Impact CTA with Atmospheric Shadow UI */}
54
+ <section className="relative w-full py-32 md:py-48 flex items-center justify-center border-t border-white/5">
55
+ <div
56
+ className="absolute inset-0 pointer-events-none overflow-hidden"
57
+ style={{
58
+ filter: `url(#${instanceId}) blur(12px)`,
59
+ opacity: 0.8
60
+ }}
61
+ >
62
+ <svg style={{ position: "absolute", width: 0, height: 0 }}>
63
+ <defs>
64
+ <filter id={instanceId}>
65
+ <feTurbulence
66
+ result="undulation"
67
+ numOctaves="2"
68
+ baseFrequency={`${mapRange(animationScale, 0, 100, 0.001, 0.0005)},${mapRange(animationScale, 0, 100, 0.004, 0.002)}`}
69
+ seed="0"
70
+ type="turbulence"
71
+ />
72
+ <feColorMatrix
73
+ ref={feColorMatrixRef}
74
+ in="undulation"
75
+ type="hueRotate"
76
+ values="180"
77
+ />
78
+ <feColorMatrix
79
+ in="dist"
80
+ result="circulation"
81
+ type="matrix"
82
+ values="4 0 0 0 1 4 0 0 0 1 4 0 0 0 1 1 0 0 0 0"
83
+ />
84
+ <feDisplacementMap
85
+ in="SourceGraphic"
86
+ in2="circulation"
87
+ scale={displacementScale}
88
+ result="dist"
89
+ />
90
+ <feDisplacementMap
91
+ in="dist"
92
+ in2="undulation"
93
+ scale={displacementScale}
94
+ result="output"
95
+ />
96
+ </filter>
97
+ </defs>
98
+ </svg>
99
+ <div
100
+ style={{
101
+ backgroundColor: 'rgba(99, 102, 241, 0.4)',
102
+ maskImage: `url('https://framerusercontent.com/images/ceBGguIpUU8luwByxuQz79t7To.png')`,
103
+ maskSize: "cover",
104
+ maskRepeat: "no-repeat",
105
+ maskPosition: "center",
106
+ width: "120%",
107
+ height: "120%",
108
+ position: 'absolute',
109
+ top: '-10%',
110
+ left: '-10%'
111
+ }}
112
+ />
113
+ </div>
114
+
115
+ {/* Noise overlay */}
116
+ <div
117
+ className="absolute inset-0 pointer-events-none opacity-[0.03]"
118
+ style={{
119
+ backgroundImage: `url("https://framerusercontent.com/images/g0QcWrxr87K0ufOxIUFBakwYA8.png")`,
120
+ backgroundSize: '100px',
121
+ backgroundRepeat: "repeat",
122
+ }}
123
+ />
124
+
125
+ <div className="relative z-20 max-w-7xl mx-auto px-6 text-center">
126
+ <motion.div
127
+ initial={{ opacity: 0, y: 30 }}
128
+ whileInView={{ opacity: 1, y: 0 }}
129
+ viewport={{ once: true }}
130
+ transition={{ duration: 0.8 }}
131
+ >
132
+ <h2 className="text-4xl md:text-7xl font-extrabold text-white mb-8 tracking-tighter">
133
+ Ready to automate your workflow?
134
+ </h2>
135
+ <p className="text-white/50 text-xl md:text-2xl mb-12 max-w-2xl mx-auto font-medium leading-relaxed">
136
+ Build smarter ML workflows with AI autonomy. Join the next generation of data scientists.
137
+ </p>
138
+ <button className="group relative px-10 py-5 bg-white text-black font-extrabold rounded-2xl transition-all hover:scale-105 active:scale-95 shadow-[0_0_50px_-12px_rgba(255,255,255,0.5)] flex items-center gap-3 mx-auto">
139
+ Get Started Now
140
+ <ArrowRight className="w-5 h-5 group-hover:translate-x-1 transition-transform" />
141
+ </button>
142
+ </motion.div>
143
+ </div>
144
+
145
+ {/* Gradient fades to blend with rest of footer */}
146
+ <div className="absolute inset-x-0 bottom-0 h-40 bg-gradient-to-t from-[#030303] to-transparent z-10" />
147
+ <div className="absolute inset-x-0 top-0 h-40 bg-gradient-to-b from-[#030303] to-transparent z-10" />
148
+ </section>
149
+
150
+ {/* Main Footer Links */}
151
+ <div className="max-w-7xl mx-auto px-6 pb-20">
152
+ <div className="pt-8 border-t border-white/5 flex flex-col md:flex-row justify-between items-center gap-6">
153
+ <div className="flex items-center gap-4">
154
+ <Logo className="w-8 h-8" />
155
+ <span className="text-white font-extrabold tracking-tight uppercase">DATA SCIENCE AGENT</span>
156
+ </div>
157
+ <div className="text-white/30 text-[10px] sm:text-xs font-semibold uppercase tracking-wider">
158
+ © 2025 Data Science Agent. Built for the autonomous future.
159
+ </div>
160
+ <div className="flex gap-8 text-white/40 text-sm font-bold italic">
161
+ <a href="#" className="hover:text-white transition-colors">Twitter</a>
162
+ <a href="#" className="hover:text-white transition-colors">GitHub</a>
163
+ <a href="#" className="hover:text-white transition-colors">Docs</a>
164
+ </div>
165
+ </div>
166
+ </div>
167
+ </footer>
168
+ );
169
+ };
170
+
171
+ export default Footer;
FRRONTEEEND/components/HeroGeometric.tsx ADDED
@@ -0,0 +1,213 @@
1
+
2
+ import React from 'react';
3
+ import { motion, Variants } from "framer-motion";
4
+ import { Circle, MessageSquare } from "lucide-react";
5
+ import { cn } from "../lib/utils";
6
+
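+ // Decorative floating gradient capsule for the hero background; props control size, tilt, gradient, and entrance delay.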
7
+ function ElegantShape({
8
+ className,
9
+ delay = 0,
10
+ width = 400,
11
+ height = 100,
12
+ rotate = 0,
13
+ gradient = "from-white/[0.08]",
14
+ }: {
15
+ className?: string;
16
+ delay?: number;
17
+ width?: number;
18
+ height?: number;
19
+ rotate?: number;
20
+ gradient?: string;
21
+ }) {
22
+ return (
23
+ <motion.div
24
+ initial={{
25
+ opacity: 0,
26
+ y: -150,
27
+ rotate: rotate - 15,
28
+ }}
29
+ animate={{
30
+ opacity: 1,
31
+ y: 0,
32
+ rotate: rotate,
33
+ }}
34
+ transition={{
35
+ duration: 2.4,
36
+ delay,
37
+ ease: [0.23, 0.86, 0.39, 0.96],
38
+ opacity: { duration: 1.2 },
39
+ }}
40
+ className={cn("absolute", className)}
41
+ >
42
+ <motion.div
43
+ animate={{
44
+ y: [0, 15, 0],
45
+ }}
46
+ transition={{
47
+ duration: 12,
48
+ repeat: Number.POSITIVE_INFINITY,
49
+ ease: "easeInOut",
50
+ }}
51
+ style={{
52
+ width,
53
+ height,
54
+ }}
55
+ className="relative"
56
+ >
57
+ <div
58
+ className={cn(
59
+ "absolute inset-0 rounded-full",
60
+ "bg-gradient-to-r to-transparent",
61
+ gradient,
62
+ "backdrop-blur-[2px] border-2 border-white/[0.15]",
63
+ "shadow-[0_8px_32px_0_rgba(255,255,255,0.1)]",
64
+ "after:absolute after:inset-0 after:rounded-full",
65
+ "after:bg-[radial-gradient(circle_at_50%_50%,rgba(255,255,255,0.2),transparent_70%)]"
66
+ )}
67
+ />
68
+ </motion.div>
69
+ </motion.div>
70
+ );
71
+ }
72
+
73
+ export function HeroGeometric({
74
+ badge = "Autonomous AI for Data Science",
75
+ title1 = "DATA SCIENCE AGENT",
76
+ title2 = "Autonomous AI for End-to-End ML",
77
+ onChatClick,
78
+ }: {
79
+ badge?: string;
80
+ title1?: string;
81
+ title2?: string;
82
+ onChatClick?: () => void;
83
+ }) {
84
+ const fadeUpVariants: Variants = {
85
+ hidden: { opacity: 0, y: 30 },
86
+ visible: (i: number) => ({
87
+ opacity: 1,
88
+ y: 0,
89
+ transition: {
90
+ duration: 1,
91
+ delay: 0.5 + i * 0.2,
92
+ ease: [0.25, 0.4, 0.25, 1] as [number, number, number, number],
93
+ },
94
+ }),
95
+ };
96
+
97
+ return (
98
+ <div className="relative min-h-screen w-full flex items-center justify-center overflow-hidden bg-[#030303]">
99
+ <div className="absolute inset-0 bg-gradient-to-br from-indigo-500/[0.05] via-transparent to-rose-500/[0.05] blur-3xl" />
100
+
101
+ <div className="absolute inset-0 overflow-hidden">
102
+ <ElegantShape
103
+ delay={0.3}
104
+ width={600}
105
+ height={140}
106
+ rotate={12}
107
+ gradient="from-indigo-500/[0.15]"
108
+ className="left-[-10%] md:left-[-5%] top-[15%] md:top-[20%]"
109
+ />
110
+ <ElegantShape
111
+ delay={0.5}
112
+ width={500}
113
+ height={120}
114
+ rotate={-15}
115
+ gradient="from-rose-500/[0.15]"
116
+ className="right-[-5%] md:right-[0%] top-[70%] md:top-[75%]"
117
+ />
118
+ <ElegantShape
119
+ delay={0.4}
120
+ width={300}
121
+ height={80}
122
+ rotate={-8}
123
+ gradient="from-violet-500/[0.15]"
124
+ className="left-[5%] md:left-[10%] bottom-[5%] md:bottom-[10%]"
125
+ />
126
+ <ElegantShape
127
+ delay={0.6}
128
+ width={200}
129
+ height={60}
130
+ rotate={20}
131
+ gradient="from-amber-500/[0.15]"
132
+ className="right-[15%] md:right-[20%] top-[10%] md:top-[15%]"
133
+ />
134
+ <ElegantShape
135
+ delay={0.7}
136
+ width={150}
137
+ height={40}
138
+ rotate={-25}
139
+ gradient="from-cyan-500/[0.15]"
140
+ className="left-[20%] md:left-[25%] top-[5%] md:top-[10%]"
141
+ />
142
+ </div>
143
+
144
+ <div className="relative z-10 container mx-auto px-4 md:px-6">
145
+ <div className="max-w-4xl mx-auto text-center">
146
+ <motion.div
147
+ custom={0}
148
+ variants={fadeUpVariants}
149
+ initial="hidden"
150
+ animate="visible"
151
+ className="inline-flex items-center gap-2 px-4 py-1.5 rounded-full bg-white/[0.03] border border-white/[0.08] mb-6 md:mb-10"
152
+ >
153
+ <Circle className="h-2 w-2 fill-indigo-500/80" />
154
+ <span className="text-xs font-semibold text-white/60 tracking-[0.1em] uppercase">
155
+ {badge}
156
+ </span>
157
+ </motion.div>
158
+
159
+ <motion.div
160
+ custom={1}
161
+ variants={fadeUpVariants}
162
+ initial="hidden"
163
+ animate="visible"
164
+ >
165
+ <h1 className="text-3xl sm:text-4xl md:text-6xl font-extrabold mb-6 md:mb-8 tracking-tight leading-[1.1]">
166
+ <span className="bg-clip-text text-transparent bg-gradient-to-b from-white to-white/80">
167
+ {title1}
168
+ </span>
169
+ <br />
170
+ <span
171
+ className={cn(
172
+ "bg-clip-text text-transparent bg-gradient-to-r from-indigo-300 via-white/90 to-rose-300"
173
+ )}
174
+ >
175
+ {title2}
176
+ </span>
177
+ </h1>
178
+ </motion.div>
179
+
180
+ <motion.div
181
+ custom={2}
182
+ variants={fadeUpVariants}
183
+ initial="hidden"
184
+ animate="visible"
185
+ >
186
+ <p className="text-sm sm:text-base md:text-lg text-white/40 mb-10 leading-relaxed font-normal tracking-tight max-w-xl mx-auto px-4">
187
+ Upload your data. Describe your goal.
188
+ Let AI handle profiling, modeling, visualization, and strategic insights autonomously.
189
+ </p>
190
+ </motion.div>
191
+
192
+ <motion.div
193
+ custom={3}
194
+ variants={fadeUpVariants}
195
+ initial="hidden"
196
+ animate="visible"
197
+ className="flex flex-col sm:flex-row items-center justify-center gap-4 px-4"
198
+ >
199
+ <button
200
+ onClick={onChatClick}
201
+ className="w-full sm:w-auto px-8 py-3.5 bg-white text-black font-bold rounded-xl hover:bg-white/90 transition-all flex items-center justify-center gap-2 group text-sm shadow-xl"
202
+ >
203
+ Chat Now
204
+ <MessageSquare className="w-4 h-4 fill-black group-hover:translate-x-0.5 transition-transform" />
205
+ </button>
206
+ </motion.div>
207
+ </div>
208
+ </div>
209
+
210
+ <div className="absolute inset-0 bg-gradient-to-t from-[#030303] via-transparent to-[#030303]/80 pointer-events-none" />
211
+ </div>
212
+ );
213
+ }
FRRONTEEEND/components/KeyCapabilities.tsx ADDED
@@ -0,0 +1,91 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+ import { Database, Wrench, Cpu, Brain, LineChart, Server } from 'lucide-react';
5
+ import { cn } from '../lib/utils';
6
+
7
+ const capabilities = [
8
+ {
9
+ title: "Autonomous ML Pipelines",
10
+ description: "End-to-end automation from profiling to deployment without manual coding.",
11
+ icon: Database,
12
+ color: "from-blue-500/20 to-cyan-500/20",
13
+ hover: "hover:bg-blue-500/10 hover:border-blue-500/30 hover:shadow-[0_0_30px_-10px_rgba(59,130,246,0.2)]"
14
+ },
15
+ {
16
+ title: "82+ Specialized Tools",
17
+ description: "An extensive arsenal for cleaning, statistical testing, and predictive modeling.",
18
+ icon: Wrench,
19
+ color: "from-purple-500/20 to-pink-500/20",
20
+ hover: "hover:bg-pink-500/10 hover:border-pink-500/30 hover:shadow-[0_0_30px_-10px_rgba(236,72,153,0.2)]"
21
+ },
22
+ {
23
+ title: "Dual LLM Intelligence",
24
+ description: "Orchestrated by Groq (for speed) and Gemini (for deep reasoning).",
25
+ icon: Brain,
26
+ color: "from-orange-500/20 to-amber-500/20",
27
+ hover: "hover:bg-amber-500/10 hover:border-amber-500/30 hover:shadow-[0_0_30px_-10px_rgba(245,158,11,0.2)]"
28
+ },
29
+ {
30
+ title: "Session Memory",
31
+ description: "Maintains context across complex workflows, allowing for iterative refinement.",
32
+ icon: Cpu,
33
+ color: "from-emerald-500/20 to-teal-500/20",
34
+ hover: "hover:bg-emerald-500/10 hover:border-emerald-500/30 hover:shadow-[0_0_30px_-10px_rgba(16,185,129,0.2)]"
35
+ },
36
+ {
37
+ title: "Visual Insights",
38
+ description: "Automatic generation of publication-quality charts and explainability reports.",
39
+ icon: LineChart,
40
+ color: "from-indigo-500/20 to-blue-500/20",
41
+ hover: "hover:bg-indigo-500/10 hover:border-indigo-500/30 hover:shadow-[0_0_30px_-10px_rgba(99,102,241,0.2)]"
42
+ },
43
+ {
44
+ title: "Cloud Run Ready",
45
+ description: "Deploy your optimized models directly to production-grade cloud environments.",
46
+ icon: Server,
47
+ color: "from-rose-500/20 to-red-500/20",
48
+ hover: "hover:bg-rose-500/10 hover:border-rose-500/30 hover:shadow-[0_0_30px_-10px_rgba(244,63,94,0.2)]"
49
+ }
50
+ ];
51
+
52
+ const KeyCapabilities = () => {
53
+ return (
54
+ <section id="features" className="py-24 bg-[#030303]">
55
+ <div className="max-w-7xl mx-auto px-6">
56
+ <div className="text-center mb-16">
57
+ <h2 className="text-4xl md:text-5xl font-extrabold text-white mb-4 tracking-tight">Powerful Orchestration</h2>
58
+ <p className="text-white/40 text-xl font-medium">Not just a chatbot, but a true system of intelligence.</p>
59
+ </div>
60
+
61
+ <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-8">
62
+ {capabilities.map((cap, i) => (
63
+ <motion.div
64
+ key={i}
65
+ initial={{ opacity: 0, y: 20 }}
66
+ whileInView={{ opacity: 1, y: 0 }}
67
+ viewport={{ once: true }}
68
+ transition={{ delay: i * 0.1 }}
69
+ whileHover={{ scale: 1.02, y: -5 }}
70
+ className={cn(
71
+ "group p-8 rounded-2xl bg-white/[0.02] border border-white/[0.08] transition-all duration-300 cursor-default",
72
+ cap.hover
73
+ )}
74
+ >
75
+ <div className={cn(
76
+ "w-12 h-12 rounded-lg bg-gradient-to-br flex items-center justify-center mb-6 group-hover:scale-110 transition-transform duration-300",
77
+ cap.color
78
+ )}>
79
+ <cap.icon className="w-6 h-6 text-white" />
80
+ </div>
81
+ <h3 className="text-xl font-bold text-white mb-3 tracking-tight">{cap.title}</h3>
82
+ <p className="text-white/50 leading-relaxed font-medium">{cap.description}</p>
83
+ </motion.div>
84
+ ))}
85
+ </div>
86
+ </div>
87
+ </section>
88
+ );
89
+ };
90
+
91
+ export default KeyCapabilities;
FRRONTEEEND/components/Logo.tsx ADDED
@@ -0,0 +1,92 @@
1
+
2
+ import React from 'react';
3
+ import { cn } from '../lib/utils';
4
+
5
+ interface LogoProps {
6
+ className?: string;
7
+ showText?: boolean;
8
+ }
9
+
10
+ export const Logo: React.FC<LogoProps> = ({ className, showText = false }) => {
11
+ return (
12
+ <div className={cn("flex flex-col items-center", className)}>
13
+ <svg
14
+ viewBox="0 0 120 120"
15
+ className="w-full h-full"
16
+ fill="none"
17
+ xmlns="http://www.w3.org/2000/svg"
18
+ >
19
+ <defs>
20
+ <linearGradient id="logoGradient" x1="0%" y1="0%" x2="100%" y2="100%">
21
+ <stop offset="0%" stopColor="#22d3ee" />
22
+ <stop offset="100%" stopColor="#6366f1" />
23
+ </linearGradient>
24
+ <filter id="glow" x="-20%" y="-20%" width="140%" height="140%">
25
+ <feGaussianBlur stdDeviation="2" result="blur" />
26
+ <feComposite in="SourceGraphic" in2="blur" operator="over" />
27
+ </filter>
28
+ </defs>
29
+
30
+ {/* Central Core */}
31
+ <circle cx="60" cy="60" r="6" fill="url(#logoGradient)" filter="url(#glow)" />
32
+
33
+ {/* Inner Circuit Ring */}
34
+ <circle cx="60" cy="60" r="18" stroke="url(#logoGradient)" strokeWidth="1" strokeDasharray="2 4" opacity="0.4" />
35
+
36
+ {/* Complex Neural Paths (Stylized) */}
37
+ <g opacity="0.8">
38
+ {[0, 45, 90, 135, 180, 225, 270, 315].map((angle) => (
39
+ <g key={angle} transform={`rotate(${angle} 60 60)`}>
40
+ <path
41
+ d="M60 35 L60 30 M60 30 L55 25 M60 30 L65 25"
42
+ stroke="url(#logoGradient)"
43
+ strokeWidth="1.5"
44
+ strokeLinecap="round"
45
+ />
46
+ <circle cx="55" cy="25" r="1.5" fill="url(#logoGradient)" />
47
+ <circle cx="65" cy="25" r="1.5" fill="url(#logoGradient)" />
48
+ </g>
49
+ ))}
50
+ </g>
51
+
52
+ {/* Middle Dashed Ring */}
53
+ <circle cx="60" cy="60" r="32" stroke="url(#logoGradient)" strokeWidth="1.5" strokeDasharray="10 6" opacity="0.6" />
54
+
55
+ {/* Outer Orbital with Squares */}
56
+ <circle cx="60" cy="60" r="45" stroke="url(#logoGradient)" strokeWidth="0.5" opacity="0.3" />
57
+ {[0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330].map((angle) => (
58
+ <rect
59
+ key={angle}
60
+ x="58"
61
+ y="12"
62
+ width="4"
63
+ height="4"
64
+ fill="url(#logoGradient)"
65
+ transform={`rotate(${angle} 60 60)`}
66
+ rx="1"
67
+ />
68
+ ))}
69
+
70
+ {/* Connection Spokes */}
71
+ {[0, 90, 180, 270].map((angle) => (
72
+ <line
73
+ key={angle}
74
+ x1="60"
75
+ y1="16"
76
+ x2="60"
77
+ y2="30"
78
+ stroke="url(#logoGradient)"
79
+ strokeWidth="1"
80
+ opacity="0.5"
81
+ transform={`rotate(${angle} 60 60)`}
82
+ />
83
+ ))}
84
+ </svg>
85
+ {showText && (
86
+ <span className="mt-2 text-white font-extrabold tracking-widest text-[10px] sm:text-xs uppercase">
87
+ DATA SCIENCE AGENT
88
+ </span>
89
+ )}
90
+ </div>
91
+ );
92
+ };
FRRONTEEEND/components/ProblemSolution.tsx ADDED
@@ -0,0 +1,70 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+ import { AlertCircle, Zap, ShieldCheck, Clock } from 'lucide-react';
5
+
6
+ const ProblemSolution = () => {
7
+ return (
8
+ <section className="py-24 relative bg-[#030303] overflow-hidden">
9
+ <div className="max-w-7xl mx-auto px-6">
10
+ <div className="grid grid-cols-1 lg:grid-cols-2 gap-16 items-center">
11
+ <motion.div
12
+ initial={{ opacity: 0, x: -30 }}
13
+ whileInView={{ opacity: 1, x: 0 }}
14
+ viewport={{ once: true }}
15
+ transition={{ duration: 0.8 }}
16
+ >
17
+ <h2 className="text-3xl md:text-5xl font-extrabold text-white mb-6 tracking-tight">
18
+ The Data Science <span className="text-rose-400">Bottleneck</span>
19
+ </h2>
20
+ <p className="text-white/60 text-lg mb-8 leading-relaxed font-medium">
21
+ Modern data science is 80% manual labor. Cleaning messy datasets, engineering features, and tuning models take weeks of repetitive effort. Mistakes are costly, and scaling insights is slow.
22
+ </p>
23
+ <ul className="space-y-4">
24
+ {[
25
+ { icon: AlertCircle, text: "Error-prone manual data preprocessing", color: "text-rose-400" },
26
+ { icon: Clock, text: "Days spent on hyperparameter tuning", color: "text-rose-400" },
27
+ { icon: AlertCircle, text: "Disconnected silos of code and insights", color: "text-rose-400" },
28
+ ].map((item, i) => (
29
+ <li key={i} className="flex items-center gap-3 text-white/80 font-semibold">
30
+ <item.icon className={`w-5 h-5 ${item.color}`} />
31
+ <span>{item.text}</span>
32
+ </li>
33
+ ))}
34
+ </ul>
35
+ </motion.div>
36
+
37
+ <motion.div
38
+ initial={{ opacity: 0, x: 30 }}
39
+ whileInView={{ opacity: 1, x: 0 }}
40
+ viewport={{ once: true }}
41
+ transition={{ duration: 0.8 }}
42
+ className="relative p-8 md:p-12 rounded-3xl bg-gradient-to-br from-indigo-500/10 via-white/5 to-rose-500/10 border border-white/10"
43
+ >
44
+ <div className="absolute -top-6 -right-6 w-32 h-32 bg-indigo-500/20 blur-3xl" />
45
+ <h2 className="text-3xl md:text-5xl font-extrabold text-white mb-6 tracking-tight">
46
+ The <span className="text-indigo-400">Autonomous</span> Solution
47
+ </h2>
48
+ <p className="text-white/60 text-lg mb-8 leading-relaxed font-medium">
49
+ DATA SCIENCE AGENT automates the entire lifecycle. From raw CSV to production-ready models and interactive dashboards, our agent uses 82+ specialized tools to deliver precision at scale.
50
+ </p>
51
+ <ul className="space-y-4">
52
+ {[
53
+ { icon: Zap, text: "Instant feature engineering and selection", color: "text-indigo-400" },
54
+ { icon: ShieldCheck, text: "Automated error recovery and re-training", color: "text-indigo-400" },
55
+ { icon: Zap, text: "Explainable AI (XAI) reports by default", color: "text-indigo-400" },
56
+ ].map((item, i) => (
57
+ <li key={i} className="flex items-center gap-3 text-white/80 font-semibold">
58
+ <item.icon className={`w-5 h-5 ${item.color}`} />
59
+ <span>{item.text}</span>
60
+ </li>
61
+ ))}
62
+ </ul>
63
+ </motion.div>
64
+ </div>
65
+ </div>
66
+ </section>
67
+ );
68
+ };
69
+
70
+ export default ProblemSolution;
FRRONTEEEND/components/Process.tsx ADDED
@@ -0,0 +1,70 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+
5
+ const steps = [
6
+ {
7
+ number: "01",
8
+ title: "Ingest Data",
9
+ description: "Upload your raw CSV, JSON, or Parquet files directly to the secure environment."
10
+ },
11
+ {
12
+ number: "02",
13
+ title: "Define Objective",
14
+ description: "Describe what you want to achieve in natural language. 'Predict churn' or 'Find outliers'."
15
+ },
16
+ {
17
+ number: "03",
18
+ title: "Agent Execution",
19
+ description: "The agent orchestrates tools to clean, transform, and model your data autonomously."
20
+ },
21
+ {
22
+ number: "04",
23
+ title: "Receive Assets",
24
+ description: "Get fully trained models, performance metrics, and interactive explainable reports."
25
+ }
26
+ ];
27
+
28
+ const Process = () => {
29
+ return (
30
+ <section id="process" className="py-24 bg-[#030303] border-y border-white/5">
31
+ <div className="max-w-7xl mx-auto px-6">
32
+ <div className="text-center mb-20">
33
+ <h2 className="text-4xl md:text-5xl font-extrabold text-white mb-4 tracking-tight">How it Works</h2>
34
+ <p className="text-white/40 text-xl font-medium">From raw data to actionable intelligence in 4 steps.</p>
35
+ </div>
36
+
37
+ <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-12">
38
+ {steps.map((step, i) => (
39
+ <motion.div
40
+ key={i}
41
+ initial={{ opacity: 0, scale: 0.95 }}
42
+ whileInView={{ opacity: 1, scale: 1 }}
43
+ viewport={{ once: true }}
44
+ transition={{ delay: i * 0.1 }}
45
+ className="relative"
46
+ >
47
+ <span className="text-7xl font-extrabold text-white/5 absolute -top-10 -left-4 select-none italic">
48
+ {step.number}
49
+ </span>
50
+ <div className="relative z-10">
51
+ <h3 className="text-xl font-bold text-white mb-4 flex items-center gap-2 tracking-tight">
52
+ <span className="w-1.5 h-1.5 rounded-full bg-indigo-500" />
53
+ {step.title}
54
+ </h3>
55
+ <p className="text-white/40 leading-relaxed font-medium">
56
+ {step.description}
57
+ </p>
58
+ </div>
59
+ {i < steps.length - 1 && (
60
+ <div className="hidden lg:block absolute top-1/2 -right-6 w-12 h-[1px] bg-gradient-to-r from-white/10 to-transparent" />
61
+ )}
62
+ </motion.div>
63
+ ))}
64
+ </div>
65
+ </div>
66
+ </section>
67
+ );
68
+ };
69
+
70
+ export default Process;
FRRONTEEEND/components/ShadowSection.tsx ADDED
@@ -0,0 +1,222 @@
1
+
2
+ 'use client';
3
+
4
+ import React, { useRef, useId, useEffect, CSSProperties } from 'react';
5
+ import { animate, useMotionValue, AnimationPlaybackControls, motion } from 'framer-motion';
6
+ import { cn } from '../lib/utils';
7
+
8
+ // Type definitions
9
+ interface ResponsiveImage {
10
+ src: string;
11
+ alt?: string;
12
+ srcSet?: string;
13
+ }
14
+
15
+ interface AnimationConfig {
16
+ preview?: boolean;
17
+ scale: number;
18
+ speed: number;
19
+ }
20
+
21
+ interface NoiseConfig {
22
+ opacity: number;
23
+ scale: number;
24
+ }
25
+
26
+ interface ShadowOverlayProps {
27
+ type?: 'preset' | 'custom';
28
+ presetIndex?: number;
29
+ customImage?: ResponsiveImage;
30
+ sizing?: 'fill' | 'stretch';
31
+ color?: string;
32
+ animation?: AnimationConfig;
33
+ noise?: NoiseConfig;
34
+ style?: CSSProperties;
35
+ className?: string;
36
+ title?: string;
37
+ description?: string;
38
+ }
39
+
40
+ function mapRange(
41
+ value: number,
42
+ fromLow: number,
43
+ fromHigh: number,
44
+ toLow: number,
45
+ toHigh: number
46
+ ): number {
47
+ if (fromLow === fromHigh) {
48
+ return toLow;
49
+ }
50
+ const percentage = (value - fromLow) / (fromHigh - fromLow);
51
+ return toLow + percentage * (toHigh - toLow);
52
+ }
53
+
54
+ const useInstanceId = (): string => {
55
+ const id = useId();
56
+ const cleanId = id.replace(/:/g, "");
57
+ const instanceId = `shadowoverlay-${cleanId}`;
58
+ return instanceId;
59
+ };
60
+
61
+ export function ShadowSection({
62
+ sizing = 'fill',
63
+ color = 'rgba(99, 102, 241, 0.6)',
64
+ animation = { scale: 50, speed: 15 },
65
+ noise = { opacity: 0.1, scale: 0.5 },
66
+ style,
67
+ className,
68
+ title = "Cognitive Core",
69
+ description = "The unseen intelligence powering your most critical decisions."
70
+ }: ShadowOverlayProps) {
71
+ const id = useInstanceId();
72
+ const animationEnabled = animation && animation.scale > 0;
73
+ const feColorMatrixRef = useRef<SVGFEColorMatrixElement>(null);
74
+ const hueRotateMotionValue = useMotionValue(180);
75
+ const hueRotateAnimation = useRef<AnimationPlaybackControls | null>(null);
76
+
77
+ const displacementScale = animation ? mapRange(animation.scale, 1, 100, 20, 100) : 0;
78
+ const animationDuration = animation ? mapRange(animation.speed, 1, 100, 1000, 50) : 1;
79
+
80
+ useEffect(() => {
81
+ if (feColorMatrixRef.current && animationEnabled) {
82
+ if (hueRotateAnimation.current) {
83
+ hueRotateAnimation.current.stop();
84
+ }
85
+ hueRotateMotionValue.set(0);
86
+ hueRotateAnimation.current = animate(hueRotateMotionValue, 360, {
87
+ duration: animationDuration / 25,
88
+ repeat: Infinity,
89
+ repeatType: "loop",
90
+ repeatDelay: 0,
91
+ ease: "linear",
92
+ delay: 0,
93
+ onUpdate: (value: number) => {
94
+ if (feColorMatrixRef.current) {
95
+ feColorMatrixRef.current.setAttribute("values", String(value));
96
+ }
97
+ }
98
+ });
99
+
100
+ return () => {
101
+ if (hueRotateAnimation.current) {
102
+ hueRotateAnimation.current.stop();
103
+ }
104
+ };
105
+ }
106
+ }, [animationEnabled, animationDuration, hueRotateMotionValue]);
107
+
108
+ return (
109
+ <section
110
+ className={cn("relative w-full h-[70vh] min-h-[500px] overflow-hidden bg-[#030303]", className)}
111
+ style={style}
112
+ >
113
+ <div
114
+ style={{
115
+ position: "absolute",
116
+ inset: -displacementScale,
117
+ filter: animationEnabled ? `url(#${id}) blur(8px)` : "none"
118
+ }}
119
+ >
120
+ {animationEnabled && (
121
+ <svg style={{ position: "absolute", width: 0, height: 0 }}>
122
+ <defs>
123
+ <filter id={id}>
124
+ <feTurbulence
125
+ result="undulation"
126
+ numOctaves="2"
127
+ baseFrequency={`${mapRange(animation.scale, 0, 100, 0.001, 0.0005)},${mapRange(animation.scale, 0, 100, 0.004, 0.002)}`}
128
+ seed="0"
129
+ type="turbulence"
130
+ />
131
+ <feColorMatrix
132
+ ref={feColorMatrixRef}
133
+ in="undulation"
134
+ type="hueRotate"
135
+ values="180"
136
+ />
137
+ <feColorMatrix
138
+ in="dist"
139
+ result="circulation"
140
+ type="matrix"
141
+ values="4 0 0 0 1 4 0 0 0 1 4 0 0 0 1 1 0 0 0 0"
142
+ />
143
+ <feDisplacementMap
144
+ in="SourceGraphic"
145
+ in2="circulation"
146
+ scale={displacementScale}
147
+ result="dist"
148
+ />
149
+ <feDisplacementMap
150
+ in="dist"
151
+ in2="undulation"
152
+ scale={displacementScale}
153
+ result="output"
154
+ />
155
+ </filter>
156
+ </defs>
157
+ </svg>
158
+ )}
159
+ <div
160
+ style={{
161
+ backgroundColor: color,
162
+ maskImage: `url('https://framerusercontent.com/images/ceBGguIpUU8luwByxuQz79t7To.png')`,
163
+ maskSize: sizing === "stretch" ? "100% 100%" : "cover",
164
+ maskRepeat: "no-repeat",
165
+ maskPosition: "center",
166
+ width: "100%",
167
+ height: "100%"
168
+ }}
169
+ />
170
+ </div>
171
+
172
+ <div
173
+ style={{
174
+ position: "absolute",
175
+ top: "50%",
176
+ left: "50%",
177
+ transform: "translate(-50%, -50%)",
178
+ textAlign: "center",
179
+ zIndex: 20,
180
+ width: '100%',
181
+ padding: '0 2rem'
182
+ }}
183
+ >
184
+ <motion.h2
185
+ initial={{ opacity: 0, y: 20 }}
186
+ whileInView={{ opacity: 1, y: 0 }}
187
+ viewport={{ once: true }}
188
+ className="md:text-7xl text-5xl lg:text-8xl font-heading font-bold text-center text-white relative z-20 tracking-tighter mb-4"
189
+ >
190
+ {title}
191
+ </motion.h2>
192
+ <motion.p
193
+ initial={{ opacity: 0, y: 20 }}
194
+ whileInView={{ opacity: 1, y: 0 }}
195
+ viewport={{ once: true }}
196
+ transition={{ delay: 0.2 }}
197
+ className="text-white/60 text-lg md:text-xl font-sans max-w-xl mx-auto"
198
+ >
199
+ {description}
200
+ </motion.p>
201
+ </div>
202
+
203
+ {noise && noise.opacity > 0 && (
204
+ <div
205
+ style={{
206
+ position: "absolute",
207
+ inset: 0,
208
+ backgroundImage: `url("https://framerusercontent.com/images/g0QcWrxr87K0ufOxIUFBakwYA8.png")`,
209
+ backgroundSize: noise.scale * 200,
210
+ backgroundRepeat: "repeat",
211
+ opacity: noise.opacity / 2,
212
+ zIndex: 15
213
+ }}
214
+ />
215
+ )}
216
+
217
+ {/* Bottom Vignette */}
218
+ <div className="absolute inset-x-0 bottom-0 h-40 bg-gradient-to-t from-[#030303] to-transparent z-30" />
219
+ <div className="absolute inset-x-0 top-0 h-40 bg-gradient-to-b from-[#030303] to-transparent z-30" />
220
+ </section>
221
+ );
222
+ }
FRRONTEEEND/components/TechStack.tsx ADDED
@@ -0,0 +1,36 @@
1
+
2
+ import React from 'react';
3
+ import { motion } from 'framer-motion';
4
+
5
+ const techs = [
6
+ "Python", "Polars", "Pandas", "Scikit-Learn", "XGBoost", "LightGBM", "Groq", "Gemini", "FastAPI", "Cloud Run", "Docker", "PyTorch"
7
+ ];
8
+
9
+ const TechStack = () => {
10
+ return (
11
+ <section className="py-24 bg-[#030303]">
12
+ <div className="max-w-7xl mx-auto px-6">
13
+ <div className="text-center mb-12">
14
+ <h3 className="text-xs font-bold uppercase tracking-[0.3em] text-white/30 italic">Built with the modern AI Stack</h3>
15
+ </div>
16
+
17
+ <div className="flex flex-wrap justify-center gap-4 md:gap-6 opacity-60">
18
+ {techs.map((tech, i) => (
19
+ <motion.div
20
+ key={tech}
21
+ initial={{ opacity: 0 }}
22
+ whileInView={{ opacity: 1 }}
23
+ viewport={{ once: true }}
24
+ transition={{ delay: i * 0.05 }}
25
+ className="px-5 py-2 rounded-lg border border-white/5 bg-white/[0.02] text-white/80 font-bold text-xs md:text-sm whitespace-nowrap tracking-wide uppercase"
26
+ >
27
+ {tech}
28
+ </motion.div>
29
+ ))}
30
+ </div>
31
+ </div>
32
+ </section>
33
+ );
34
+ };
35
+
36
+ export default TechStack;
FRRONTEEEND/index.html ADDED
@@ -0,0 +1,59 @@
1
+
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="UTF-8" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>Data Science Agent</title>
8
+ <script src="https://cdn.tailwindcss.com"></script>
9
+ <link rel="preconnect" href="https://fonts.googleapis.com">
10
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
11
+ <link href="https://fonts.googleapis.com/css2?family=Plus+Jakarta+Sans:ital,wght@0,200;0,300;0,400;0,500;0,600;0,700;0,800;1,200;1,300;1,400;1,500;1,600;1,700;1,800&display=swap" rel="stylesheet">
12
+ <script>
13
+ tailwind.config = {
14
+ theme: {
15
+ extend: {
16
+ fontFamily: {
17
+ sans: ['Plus Jakarta Sans', 'sans-serif'],
18
+ heading: ['Plus Jakarta Sans', 'sans-serif'],
19
+ mono: ['Plus Jakarta Sans', 'sans-serif'],
20
+ },
21
+ },
22
+ },
23
+ }
24
+ </script>
25
+ <style>
26
+ body {
27
+ margin: 0;
28
+ background-color: #030303;
29
+ overflow-x: hidden;
30
+ font-family: 'Plus Jakarta Sans', sans-serif;
31
+ -webkit-font-smoothing: antialiased;
32
+ -moz-osx-font-smoothing: grayscale;
33
+ }
34
+ ::selection {
35
+ background-color: rgba(99, 102, 241, 0.3);
36
+ color: white;
37
+ }
38
+ </style>
39
+ <script type="importmap">
40
+ {
41
+ "imports": {
42
+ "react": "https://esm.sh/react@^19.2.3",
43
+ "react-dom/": "https://esm.sh/react-dom@^19.2.3/",
44
+ "react/": "https://esm.sh/react@^19.2.3/",
45
+ "clsx": "https://esm.sh/clsx@^2.1.1",
46
+ "tailwind-merge": "https://esm.sh/tailwind-merge@^3.4.0",
47
+ "framer-motion": "https://esm.sh/framer-motion@^12.23.26",
48
+ "lucide-react": "https://esm.sh/lucide-react@^0.562.0",
49
+ "@google/genai": "https://esm.sh/@google/genai@^1.34.0"
50
+ }
51
+ }
52
+ </script>
53
+ <link rel="stylesheet" href="/index.css">
54
+ </head>
55
+ <body>
56
+ <div id="root"></div>
57
+ <script type="module" src="/index.tsx"></script>
58
+ </body>
59
+ </html>
FRRONTEEEND/index.tsx ADDED
@@ -0,0 +1,16 @@
1
+
2
+ import React from 'react';
3
+ import ReactDOM from 'react-dom/client';
4
+ import App from './App';
5
+
6
+ const rootElement = document.getElementById('root');
7
+ if (!rootElement) {
8
+ throw new Error("Could not find root element to mount to");
9
+ }
10
+
11
+ const root = ReactDOM.createRoot(rootElement);
12
+ root.render(
13
+ <React.StrictMode>
14
+ <App />
15
+ </React.StrictMode>
16
+ );
FRRONTEEEND/lib/utils.ts ADDED
@@ -0,0 +1,7 @@
1
+
2
+ import { clsx, type ClassValue } from 'clsx';
3
+ import { twMerge } from 'tailwind-merge';
4
+
5
+ export function cn(...inputs: ClassValue[]) {
6
+ return twMerge(clsx(inputs));
7
+ }
FRRONTEEEND/metadata.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "name": "Data Science Agent",
3
+ "description": "A production-grade autonomous AI agent for end-to-end data science workflows, featuring 82+ specialized tools and dual LLM support.",
4
+ "requestFramePermissions": []
5
+ }
FRRONTEEEND/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
FRRONTEEEND/package.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "name": "data-science-agent",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "preview": "vite preview"
10
+ },
11
+ "dependencies": {
12
+ "react": "^19.2.3",
13
+ "react-dom": "^19.2.3",
14
+ "clsx": "^2.1.1",
15
+ "tailwind-merge": "^3.4.0",
16
+ "framer-motion": "^12.23.26",
17
+ "lucide-react": "^0.562.0",
18
+ "react-markdown": "^9.0.1"
19
+ },
20
+ "devDependencies": {
21
+ "@types/node": "^22.14.0",
22
+ "@vitejs/plugin-react": "^5.0.0",
23
+ "typescript": "~5.8.2",
24
+ "vite": "^6.2.0"
25
+ }
26
+ }
FRRONTEEEND/tsconfig.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "experimentalDecorators": true,
5
+ "useDefineForClassFields": false,
6
+ "module": "ESNext",
7
+ "lib": [
8
+ "ES2022",
9
+ "DOM",
10
+ "DOM.Iterable"
11
+ ],
12
+ "skipLibCheck": true,
13
+ "types": [
14
+ "node"
15
+ ],
16
+ "moduleResolution": "bundler",
17
+ "isolatedModules": true,
18
+ "moduleDetection": "force",
19
+ "allowJs": true,
20
+ "jsx": "react-jsx",
21
+ "paths": {
22
+ "@/*": [
23
+ "./*"
24
+ ]
25
+ },
26
+ "allowImportingTsExtensions": true,
27
+ "noEmit": true
28
+ }
29
+ }
FRRONTEEEND/vite.config.ts ADDED
@@ -0,0 +1,29 @@
1
+ import path from 'path';
2
+ import { defineConfig, loadEnv } from 'vite';
3
+ import react from '@vitejs/plugin-react';
4
+
5
+ export default defineConfig(({ mode }) => {
6
+ const env = loadEnv(mode, '.', '');
7
+ return {
8
+ server: {
9
+ port: 3000,
10
+ host: '0.0.0.0',
11
+ proxy: {
12
+ '/api': {
13
+ target: env.VITE_API_URL || 'http://localhost:8080',
14
+ changeOrigin: true,
15
+ rewrite: (path) => path.replace(/^\/api/, '')
16
+ }
17
+ }
18
+ },
19
+ plugins: [react()],
20
+ define: {
21
+ 'import.meta.env.VITE_API_URL': JSON.stringify(env.VITE_API_URL || 'http://localhost:8080')
22
+ },
23
+ resolve: {
24
+ alias: {
25
+ '@': path.resolve(__dirname, '.'),
26
+ }
27
+ }
28
+ };
29
+ });
GEMINI_UPDATE.md ADDED
@@ -0,0 +1,93 @@
1
+ # 🔄 Updated to Use Google Gemini!
2
+
3
+ ## What Changed
4
+
5
+ The application now uses **Google Gemini (gemini-2.0-flash-exp)** instead of Groq for the chat interface.
6
+
7
+ ## Required Setup
8
+
9
+ ### 1. Set Your Google API Key
10
+
11
+ ```powershell
12
+ # Windows PowerShell
13
+ $env:GOOGLE_API_KEY="your-google-api-key-here"
14
+
15
+ # Verify it's set
16
+ echo $env:GOOGLE_API_KEY
17
+ ```
18
+
19
+ ### 2. Get Your API Key
20
+
21
+ If you don't have a Google API key:
22
+ 1. Go to [Google AI Studio](https://aistudio.google.com/app/apikey)
23
+ 2. Create a new API key
24
+ 3. Copy and set it as shown above
25
+
26
+ ## Quick Start
27
+
28
+ ```powershell
29
+ # Set your API key
30
+ $env:GOOGLE_API_KEY="your-key-here"
31
+
32
+ # Run the application
33
+ .\start.ps1
34
+ ```
35
+
36
+ Then open: **http://localhost:8080**
37
+
38
+ ## What's Using Gemini
39
+
40
+ - ✅ **Chat Interface** (`/chat` endpoint) - Uses Gemini 2.0 Flash
41
+ - ℹ️ **Full Workflow** (`/run` endpoint) - Uses the main agent (configurable via LLM_PROVIDER)
42
+
43
+ ## Technical Details
44
+
45
+ The `/chat` endpoint now:
46
+ - Uses `google.generativeai` SDK
47
+ - Model: `gemini-2.0-flash-exp`
48
+ - Maintains conversation history
49
+ - Professional data science system instruction
50
+
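+ As an illustration, a minimal handler along these lines could be written with the `google.generativeai` SDK (a sketch only — function and variable names here are assumptions, not the exact code in src/api/app.py):
+
+ ```python
+ import os
+ import google.generativeai as genai
+
+ genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+
+ # System instruction keeps the model acting as a data science assistant.
+ model = genai.GenerativeModel(
+     "gemini-2.0-flash-exp",
+     system_instruction="You are a professional data science assistant.",
+ )
+
+ def chat_reply(messages: list[dict]) -> str:
+     """messages: [{"role": "user"|"assistant", "content": "..."}] as sent to /chat."""
+     # Gemini's history format uses "model" for assistant turns and "parts" for text.
+     history = [
+         {"role": "model" if m["role"] == "assistant" else "user", "parts": [m["content"]]}
+         for m in messages[:-1]
+     ]
+     chat = model.start_chat(history=history)
+     return chat.send_message(messages[-1]["content"]).text
+ ```
+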
51
+ ## Expected Console Output
52
+
53
+ When you start the server:
54
+ ```
55
+ INFO: Started server process [####]
56
+ INFO: Waiting for application startup.
57
+ ✅ Agent initialized with provider: gemini
58
+ ✅ Frontend assets mounted from C:\Users\Pulastya\Videos\DS AGENTTTT\FRRONTEEEND\dist
59
+ INFO: Application startup complete.
60
+ INFO: Uvicorn running on http://0.0.0.0:8080
61
+ ```
62
+
63
+ ## Files Updated
64
+
65
+ - ✅ [src/api/app.py](src/api/app.py) - `/chat` endpoint now uses Gemini
66
+ - ✅ [.env.example](.env.example) - Updated to GOOGLE_API_KEY
67
+ - ✅ [start.ps1](start.ps1) - Updated environment variable reference
68
+ - ✅ [start.sh](start.sh) - Updated environment variable reference
69
+ - ✅ [CHECKLIST.md](CHECKLIST.md) - Updated instructions
70
+ - ✅ [FRRONTEEEND/.env](FRRONTEEEND/.env) - Added note about Gemini
71
+
72
+ ## Troubleshooting
73
+
74
+ ### Error: "API key not configured"
75
+ **Solution**: Make sure you've set the environment variable:
76
+ ```powershell
77
+ $env:GOOGLE_API_KEY="your-actual-api-key"
78
+ ```
79
+
80
+ ### Error: "Module google.generativeai not found"
81
+ **Solution**: The dependency is already in requirements.txt. Verify it's installed:
82
+ ```bash
83
+ pip install google-generativeai
84
+ ```
85
+
86
+ ### Rate Limits
87
+ Gemini 2.0 Flash has generous rate limits:
88
+ - Free tier: 15 RPM (requests per minute)
89
+ - 1 million TPM (tokens per minute)
90
+
91
+ ---
92
+
93
+ **Ready?** Set your `GOOGLE_API_KEY` and run `.\start.ps1` 🚀
MIGRATION_COMPLETE.md ADDED
@@ -0,0 +1,325 @@
1
+ # 🎉 Frontend Migration Complete!
2
+
3
+ ## Summary
4
+
5
+ Successfully replaced the old Gradio interface with a modern React-based frontend featuring:
6
+ - **Professional Landing Page**: Showcases the agent's capabilities
7
+ - **Modern Chat Interface**: NextChat-style conversational UI
8
+ - **Direct Backend Integration**: Communicates with FastAPI backend
9
+ - **Beautiful Design**: Dark theme with animations and responsive layout
10
+
11
+ ## What Was Changed
12
+
13
+ ### ✅ Backend Updates ([src/api/app.py](src/api/app.py))
14
+ 1. **Added CORS middleware** for frontend communication
15
+ 2. **Created `/chat` endpoint** for conversational interface
16
+ 3. **Static file serving** for built React app
17
+ 4. **Catch-all route** to serve `index.html` for client-side routing
18
+
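+ Taken together, the four updates above boil down to roughly the following FastAPI wiring (a simplified sketch, not the full src/api/app.py — the chat body and exact paths are abbreviated):
+
+ ```python
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse
+ from fastapi.staticfiles import StaticFiles
+
+ app = FastAPI()
+
+ # 1. CORS so the Vite dev server (port 3000) can call the API (port 8080)
+ app.add_middleware(CORSMiddleware, allow_origins=["*"],
+                    allow_methods=["*"], allow_headers=["*"])
+
+ # 2. Conversational endpoint consumed by ChatInterface.tsx
+ @app.post("/chat")
+ async def chat(payload: dict):
+     ...  # forward payload["messages"] to the LLM, return the reply
+
+ # 3. Static assets produced by `npm run build`
+ app.mount("/assets", StaticFiles(directory="FRRONTEEEND/dist/assets"), name="assets")
+
+ # 4. Catch-all: any non-API GET falls through to the React app
+ @app.get("/{full_path:path}")
+ async def spa(full_path: str):
+     return FileResponse("FRRONTEEEND/dist/index.html")
+ ```
+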
19
+ ### ✅ Frontend Updates
20
+ 1. **Removed Google GenAI dependency** from [package.json](FRRONTEEEND/package.json)
21
+ 2. **Updated ChatInterface.tsx** to call backend `/chat` endpoint instead of external API
22
+ 3. **Added environment configuration**:
23
+ - `.env` for local development
24
+ - `.env.production` for production builds
25
+ 4. **Updated vite.config.ts** with proxy configuration
26
+
27
+ ### ✅ Configuration Files
28
+ 1. **requirements.txt**: Commented out Gradio (no longer needed)
29
+ 2. **Dockerfile**: Added multi-stage build for React frontend
30
+ 3. **.dockerignore**: Excluded node_modules and frontend dev files
31
+ 4. **New Scripts**:
32
+ - `start.ps1` / `start.sh` - Quick start scripts
33
+ - `build-and-deploy.ps1` / `build-and-deploy.sh` - Build scripts
34
+
35
+ ### ✅ Documentation
36
+ - **FRONTEND_INTEGRATION.md**: Complete integration guide
37
+ - **README.md**: Updated with frontend announcement
38
+
39
+ ## 🚀 How to Run
40
+
41
+ ### Quick Start (Recommended)
42
+
43
+ **Windows:**
44
+ ```powershell
45
+ .\start.ps1
46
+ ```
47
+
48
+ **Linux/Mac:**
49
+ ```bash
50
+ chmod +x start.sh
51
+ ./start.sh
52
+ ```
53
+
54
+ ### Manual Steps
55
+
56
+ 1. **Build Frontend** (already done ✅):
57
+ ```bash
58
+ cd FRRONTEEEND
59
+ npm.cmd install
60
+ npm.cmd run build
61
+ cd ..
62
+ ```
63
+
64
+ 2. **Set Environment Variables**:
65
+ ```powershell
66
+ # Required
67
+ $env:GROQ_API_KEY="your-groq-api-key-here"
68
+
69
+ # Optional
70
+ $env:GOOGLE_API_KEY="your-google-api-key"
71
+ ```
72
+
73
+ 3. **Start Backend**:
74
+ ```bash
75
+ python src\api\app.py
76
+ ```
77
+
78
+ 4. **Access Application**:
79
+ Open browser to: **http://localhost:8080**
80
+
81
+ ## 🏗️ Architecture
82
+
83
+ ```
84
+ ┌─────────────────────────────────────────────────────────┐
85
+ │ Browser │
86
+ │ │
87
+ │ ┌──────────────────────────────────────────────────┐ │
88
+ │ │ React Frontend (Port 8080) │ │
89
+ │ │ - Landing Page (HeroGeometric, etc.) │ │
90
+ │ │ - Chat Interface (ChatInterface.tsx) │ │
91
+ │ └──────────────────────────────────────────────────┘ │
92
+ │ │ │
93
+ │ │ HTTP POST /chat │
94
+ └─────────────────────────┼────────────────────────────────┘
95
+
96
+
97
+ ┌─────────────────────────────────────────────────────────┐
98
+ │ FastAPI Backend (Port 8080) │
99
+ │ │
100
+ │ ┌──────────────────────────────────────────────────┐ │
101
+ │ │ API Endpoints │ │
102
+ │ │ - POST /chat → Chat with agent │ │
103
+ │ │ - POST /run → Full workflow │ │
104
+ │ │ - POST /profile → Dataset profiling │ │
105
+ │ │ - GET /tools → List tools │ │
106
+ │ │ - GET /* → Serve React app │ │
107
+ │ └──────────────────────────────────────────────────┘ │
108
+ │ │ │
109
+ │ ▼ │
110
+ │ ┌──────────────────────────────────────────────────┐ │
111
+ │ │ DataScienceCopilot (orchestrator.py) │ │
112
+ │ │ - 82+ Tools │ │
113
+ │ │ - Groq LLM │ │
114
+ │ │ - Session Memory │ │
115
+ │ └──────────────────────────────────────────────────┘ │
116
+ └─────────────────────────────────────────────────────────┘
117
+ ```
118
+
119
+ ## 🎯 Key Endpoints
120
+
121
+ ### `/chat` - Conversational Interface
122
+ ```typescript
123
+ POST /chat
124
+ Content-Type: application/json
125
+
126
+ {
127
+ "messages": [
128
+ {"role": "user", "content": "Profile my dataset"},
129
+ {"role": "assistant", "content": "..."}
130
+ ],
131
+ "stream": false
132
+ }
133
+ ```
134
+
135
+ **Response:**
136
+ ```json
137
+ {
138
+ "success": true,
139
+ "message": "I can help you profile your dataset...",
140
+ "model": "llama-3.3-70b-versatile",
141
+ "provider": "groq"
142
+ }
143
+ ```
144
+
145
+ ### `/run` - Complete Workflow
146
+ ```bash
147
+ POST /run
148
+ Content-Type: multipart/form-data
149
+
150
+ file: <dataset.csv>
151
+ task_description: "Predict house prices"
152
+ target_col: "price"
153
+ ```
154
+
155
+ ### `/profile` - Quick Profiling
156
+ ```bash
157
+ POST /profile
158
+ Content-Type: multipart/form-data
159
+
160
+ file: <dataset.csv>
161
+ ```
162
+
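+ The same three endpoints can also be exercised from Python — a hedged client sketch (field names mirror the examples above; nothing beyond them is assumed):
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8080"
+
+ # /chat takes a JSON body with the running message history
+ r = requests.post(f"{BASE}/chat", json={
+     "messages": [{"role": "user", "content": "Profile my dataset"}],
+     "stream": False,
+ })
+ print(r.json()["message"])
+
+ # /run (and /profile) take multipart form uploads
+ with open("data.csv", "rb") as f:
+     r = requests.post(f"{BASE}/run", files={"file": f}, data={
+         "task_description": "Predict house prices",
+         "target_col": "price",
+     })
+ print(r.json())
+ ```
+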
163
+ ## 📝 Environment Variables
164
+
165
+ ### Backend (.env or system)
166
+ ```env
167
+ # Required
168
+ GROQ_API_KEY=your-groq-api-key
169
+
170
+ # Optional
171
+ GOOGLE_API_KEY=your-google-api-key
172
+ GCP_PROJECT_ID=your-project-id
173
+ LLM_PROVIDER=groq # or "gemini"
174
+ ```
175
+
176
+ ### Frontend (FRRONTEEEND/.env)
177
+ ```env
178
+ # Development
179
+ VITE_API_URL=http://localhost:8080
180
+
181
+ # Production (FRRONTEEEND/.env.production)
182
+ VITE_API_URL=https://your-cloud-run-url.run.app
183
+ ```
184
+
185
+ ## 🐳 Docker Deployment
186
+
187
+ The Dockerfile now includes a multi-stage build:
188
+
189
+ ```bash
190
+ # Build image
191
+ docker build -t data-science-agent .
192
+
193
+ # Run container
194
+ docker run -p 8080:8080 \
195
+ -e GROQ_API_KEY=your-key \
196
+ data-science-agent
197
+ ```
198
+
199
+ ## ☁️ Google Cloud Run Deployment
200
+
201
+ ```bash
202
+ # Build and push
203
+ gcloud builds submit --tag gcr.io/YOUR-PROJECT-ID/data-science-agent
204
+
205
+ # Deploy
206
+ gcloud run deploy data-science-agent \
207
+ --image gcr.io/YOUR-PROJECT-ID/data-science-agent \
208
+ --platform managed \
209
+ --region us-central1 \
210
+ --allow-unauthenticated \
211
+ --set-env-vars GROQ_API_KEY=your-api-key
212
+ ```
213
+
214
+ ## 🔍 Testing
215
+
216
+ ### Test Backend API
217
+ ```bash
218
+ # Health check
219
+ curl http://localhost:8080/health
220
+
221
+ # List tools
222
+ curl http://localhost:8080/tools
223
+
224
+ # Chat
225
+ curl -X POST http://localhost:8080/chat \
226
+ -H "Content-Type: application/json" \
227
+ -d '{
228
+ "messages": [
229
+ {"role": "user", "content": "Hello, what can you do?"}
230
+ ]
231
+ }'
232
+ ```
233
+
234
+ ### Test Frontend
235
+ 1. Open browser: http://localhost:8080
236
+ 2. Click "Launch Console"
237
+ 3. Type a message and send
238
+
239
+ ## 🎨 Frontend Development
240
+
241
+ For frontend development with hot-reloading:
242
+
243
+ **Terminal 1 - Backend:**
244
+ ```bash
245
+ python src\api\app.py
246
+ ```
247
+
248
+ **Terminal 2 - Frontend:**
249
+ ```bash
250
+ cd FRRONTEEEND
251
+ npm.cmd run dev
252
+ ```
253
+
254
+ Access:
255
+ - Frontend Dev: http://localhost:3000
256
+ - Backend API: http://localhost:8080
257
+
258
+ ## 📦 Build Status
259
+
260
+ ✅ **Frontend Built**: FRRONTEEEND/dist/ contains:
261
+ - index.html
262
+ - assets/index-[hash].js (384 KB)
263
+
264
+ ✅ **Backend Ready**: src/api/app.py configured to:
265
+ - Serve static files from FRRONTEEEND/dist/assets
266
+ - Route all non-API requests to index.html
267
+ - Handle /chat endpoint
268
+
269
+ ## 🔄 Migration Notes
270
+
271
+ ### What's Deprecated
272
+ - ❌ `chat_ui.py` - Old Gradio interface (kept for reference)
273
+ - ❌ Direct Google GenAI calls from frontend
274
+
275
+ ### What's New
276
+ - ✅ React 19 + TypeScript
277
+ - ✅ Vite 6 build system
278
+ - ✅ Tailwind CSS styling
279
+ - ✅ Framer Motion animations
280
+ - ✅ Backend-first architecture
281
+
282
+ ## 🐛 Troubleshooting
283
+
284
+ ### Issue: Frontend shows 404
285
+ **Solution**: Make sure you've built the frontend:
286
+ ```bash
287
+ cd FRRONTEEEND
288
+ npm.cmd run build
289
+ ```
290
+
291
+ ### Issue: API errors in chat
292
+ **Solution**:
293
+ 1. Check backend is running: `python src\api\app.py`
294
+ 2. Verify GROQ_API_KEY is set
295
+ 3. Check console for errors
296
+
297
+ ### Issue: CORS errors
298
+ **Solution**: The backend has CORS enabled. If issues persist, check the `allow_origins` in app.py
299
+
300
+ ### Issue: Module import errors
301
+ **Solution**: Make sure all Python dependencies are installed:
302
+ ```bash
303
+ pip install -r requirements.txt
304
+ ```
305
+
306
+ ## 📚 Additional Resources
307
+
308
+ - **[FRONTEND_INTEGRATION.md](FRONTEND_INTEGRATION.md)** - Detailed integration guide
309
+ - **[README.md](README.md)** - Main project documentation
310
+ - **[DEPLOYMENT.md](DEPLOYMENT.md)** - Cloud deployment guide
311
+
312
+ ## ✨ Next Steps
313
+
314
+ 1. **File Upload**: Add file upload capability to ChatInterface
315
+ 2. **Visualizations**: Display charts and plots in chat
316
+ 3. **Session Persistence**: Store chat history in backend
317
+ 4. **Authentication**: Add user authentication
318
+ 5. **Streaming**: Implement streaming responses
319
+ 6. **Dark/Light Mode**: Add theme toggle
320
+
321
+ ---
322
+
323
+ **Status**: ✅ Ready to use!
324
+
325
+ **Last Updated**: December 27, 2025
QUICK_REFERENCE.txt ADDED
@@ -0,0 +1,71 @@
1
+ ╔═══════════════════════════════════════════════════════════════╗
2
+ ║ 🚀 DATA SCIENCE AGENT - QUICK REFERENCE ║
3
+ ║ Now powered by Google Gemini! 🤖 ║
4
+ ╚═══════════════════════════════════════════════════════════════╝
5
+
6
+ ┌───────────────────────────────────────────────────────────────┐
7
+ │ 1. SET API KEY (REQUIRED!) │
8
+ └───────────────────────────────────────────────────────────────┘
9
+
10
+ PowerShell:
11
+ $env:GOOGLE_API_KEY="your-google-api-key-here"
12
+
13
+ Get your key: https://aistudio.google.com/app/apikey
14
+
15
+ ┌───────────────────────────────────────────────────────────────┐
16
+ │ 2. START THE APPLICATION │
17
+ └───────────────────────────────────────────────────────────────┘
18
+
19
+ .\start.ps1
20
+
21
+ ┌───────────────────────────────────────────────────────────────┐
22
+ │ 3. ACCESS THE APP │
23
+ └───────────────────────────────────────────────────────────────┘
24
+
25
+ Open browser: http://localhost:8080
26
+
27
+ ┌───────────────────────────────────────────────────────────────┐
28
+ │ WHAT'S INCLUDED │
29
+ └───────────────────────────────────────────────────────────────┘
30
+
31
+ ✅ Modern React frontend with landing page
32
+ ✅ Professional chat interface
33
+ ✅ Google Gemini 2.0 Flash integration
34
+ ✅ 82+ data science tools
35
+ ✅ Complete ML pipeline automation
36
+
37
+ ┌───────────────────────────────────────────────────────────────┐
38
+ │ KEY FILES │
39
+ └───────────────────────────────────────────────────────────────┘
40
+
41
+ 📖 GEMINI_UPDATE.md - Gemini migration details
42
+ 📖 CHECKLIST.md - Pre-launch checklist
43
+ 📖 MIGRATION_COMPLETE.md - Full change log
44
+ 📖 FRONTEND_INTEGRATION.md - Technical docs
45
+
46
+ ┌───────────────────────────────────────────────────────────────┐
47
+ │ TROUBLESHOOTING │
48
+ └───────────────────────────────────────────────────────────────┘
49
+
50
+ Issue: "API key not configured"
51
+ → Set: $env:GOOGLE_API_KEY="your-key"
52
+
53
+ Issue: "Frontend not found"
54
+ → Run: cd FRRONTEEEND && npm run build
55
+
56
+ Issue: "Module not found"
57
+ → Run: pip install -r requirements.txt
58
+
59
+ ┌───────────────────────────────────────────────────────────────┐
60
+ │ API ENDPOINTS │
61
+ └───────────────────────────────────────────────────────────────┘
62
+
63
+ POST /chat - Chat with Gemini agent
64
+ POST /run - Full ML workflow
65
+ POST /profile - Quick dataset profiling
66
+ GET /tools - List available tools
67
+ GET /docs - API documentation
68
+
69
+ ╔═══════════════════════════════════════════════════════════════╗
70
+ ║ Ready to start? Run: .\start.ps1 ║
71
+ ���═══════════════════════════════════════════════════════════════╝
README.md ADDED
@@ -0,0 +1,632 @@
1
+ # Data Science Agent 🤖
2
+
3
+ A production-grade **autonomous AI agent** for end-to-end data science workflows. Upload datasets, describe your goal in natural language, and let the AI handle profiling, cleaning, feature engineering, model training, and visualization.
4
+
5
+ **Key Differentiator**: Not just a chatbot - a true AI agent with 82+ specialized tools, intelligent orchestration, dual LLM support, session memory, code interpreter, and Cloud Run API.
6
+
7
+ ---
8
+
9
+ > ## 🎉 **NEW: Modern React Frontend!**
10
+ >
11
+ > The application now features a **professional React-based web interface** with a beautiful landing page and chat UI, replacing the old Gradio interface.
12
+ >
13
+ > **Quick Start:**
14
+ > ```powershell
15
+ > .\start.ps1 # Windows
16
+ > ```
17
+ > or
18
+ > ```bash
19
+ > ./start.sh # Linux/Mac
20
+ > ```
21
+ >
22
+ > 📖 **[See Full Frontend Integration Guide →](FRONTEND_INTEGRATION.md)**
23
+
24
+ ---
25
+
26
+ ## 🎯 Project Vision
27
+
28
+ Build an **autonomous data science system** that achieves **50-70th percentile performance** on Kaggle competitions through intelligent automation, proving AI agents can handle real-world ML workflows end-to-end.
29
+
30
+ ---
31
+
32
+ ## ✨ Core Features
33
+
34
+ ### **🤖 Intelligent Agent System**
35
+ - **82+ Specialized Tools** across 14 categories (profiling, cleaning, feature engineering, ML, visualization, BigQuery)
36
+ - **Dual LLM Support**: Groq (llama-3.3-70b) + Google Gemini (2.0-flash-exp)
37
+ - **Smart Orchestration**: LLM-powered function calling with intelligent tool chaining
38
+ - **Session Memory**: Contextual awareness across conversations ("cross-validate it", "try with Ridge")
39
+ - **Code Interpreter**: Write and execute custom Python code for tasks beyond predefined tools
40
+ - **Error Recovery**: Automatic retry with corrected parameters
41
+ - **Reasoning Modules**: Dedicated LLM reasoning layer with 19 specialized functions
42
+ - **Cloud Integration**: BigQuery data access + GCS artifact storage
43
+
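+ At its core, the orchestration above is the standard LLM function-calling loop: send the conversation plus tool schemas, execute whatever tools the model requests, feed the results back, and repeat until the model answers in plain text. A stripped-down sketch of that loop (illustrative only — the real orchestrator adds session memory, retries, and schema compression):
+
+ ```python
+ import json
+ from groq import Groq
+
+ client = Groq()  # reads GROQ_API_KEY from the environment
+ TOOL_IMPLS = {"profile_dataset": lambda **kw: {"rows": 891}}  # hypothetical registry
+ TOOL_SCHEMAS = [...]  # JSON schemas describing the registered tools
+
+ def run(messages: list[dict]) -> str:
+     while True:
+         resp = client.chat.completions.create(
+             model="llama-3.3-70b-versatile",
+             messages=messages,
+             tools=TOOL_SCHEMAS,
+             tool_choice="auto",
+         )
+         msg = resp.choices[0].message
+         if not msg.tool_calls:       # plain-text answer: the loop is done
+             return msg.content
+         messages.append(msg)         # keep the tool request in context
+         for call in msg.tool_calls:  # execute each tool, feed results back
+             result = TOOL_IMPLS[call.function.name](**json.loads(call.function.arguments))
+             messages.append({"role": "tool", "tool_call_id": call.id,
+                              "content": json.dumps(result)})
+ ```
+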
44
+ ### 🎨 **Multiple Interfaces**
45
+ - **Gradio Web UI** (`chat_ui.py`): Upload files, chat interface, visual plots
46
+ - **CLI Interface** (`src/cli.py`): Command-line workflow automation
47
+ - **REST API** (`src/api/app.py`): Cloud Run-ready FastAPI wrapper
48
+ - **Python SDK**: Direct programmatic access
49
+
50
+ ### 📊 **Complete ML Pipeline**
51
+ 1. **Data Profiling** → Statistics, types, quality issues
52
+ 2. **Data Cleaning** → Smart imputation, outlier handling, type conversion
53
+ 3. **Feature Engineering** → Time features, encoding, interactions, ratios
54
+ 4. **Model Training** → XGBoost, LightGBM, CatBoost, ensemble methods
55
+ 5. **Hyperparameter Tuning** → Optuna-based optimization
56
+ 6. **Visualization** → Matplotlib, Plotly, interactive dashboards
57
+ 7. **EDA Reports** → Sweetviz, ydata-profiling HTML reports
58
+ 8. **Explainability** → SHAP values, feature importance
59
+
60
+ ### ⚡ **Performance & Scale**
61
+ - **Token Optimization**: 34% reduction in LLM context (compressed tool schemas)
62
+ - **SQLite Caching**: Memoization of expensive operations with TTL
63
+ - **Polars & DuckDB**: 10-100x faster than pandas for large datasets
64
+ - **Rate Limiting**: Intelligent API call management (Groq: 12K TPM, Gemini: 10 RPM)
65
+ - **Cloud Ready**: FastAPI service for Google Cloud Run deployment
66
+
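+ The cache works by keying results on a hash of the operation and its inputs, with an expiry timestamp. The core pattern looks something like this (a minimal sketch of the idea, not cache_manager.py itself):
+
+ ```python
+ import hashlib, json, pickle, sqlite3, time
+
+ con = sqlite3.connect("./cache_db/cache.db")
+ con.execute("CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, val BLOB, expires REAL)")
+
+ def cached(fn, ttl: int = 86400):
+     def wrapper(*args, **kwargs):
+         key = hashlib.sha256(
+             json.dumps([fn.__name__, args, kwargs], default=str).encode()
+         ).hexdigest()
+         row = con.execute("SELECT val, expires FROM cache WHERE key = ?", (key,)).fetchone()
+         if row and row[1] > time.time():  # fresh hit: skip the expensive call
+             return pickle.loads(row[0])
+         result = fn(*args, **kwargs)
+         con.execute("INSERT OR REPLACE INTO cache VALUES (?, ?, ?)",
+                     (key, pickle.dumps(result), time.time() + ttl))
+         con.commit()
+         return result
+     return wrapper
+ ```
+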
67
+ ---
68
+
69
+ ## 🏗️ Architecture
70
+
71
+ ### **System Design**
72
+
73
+ ```
74
+ ┌─────────────────────────────────────────────────────────────┐
75
+ │ User Interfaces │
76
+ │ Gradio UI │ CLI │ REST API │ Python SDK │
77
+ └─────────────────────────┬───────────────────────────────────┘
78
+
79
+ ┌─────────────────────────────────────────────────────────────┐
80
+ │ DataScienceCopilot Orchestrator │
81
+ │ • LLM Function Calling (Groq/Gemini) │
82
+ │ • Session Memory Management │
83
+ │ • Tool Execution & Chaining │
84
+ │ • Error Recovery & Retry Logic │
85
+ └─────────────────────────┬───────────────────────────────────┘
86
+
87
+ ┌─────────────────────────────────────────────────────────────┐
88
+ │                    82+ Specialized Tools                    │
89
+ │ Data Profiling │ Cleaning │ Feature Engineering │
90
+ │ Model Training │ Visualization │ EDA Reports │
91
+ │ NLP/Text │ Computer Vision │ Time Series │ MLOps │
92
+ └─────────────────────────┬───────────────────────────────────┘
93
+
94
+ ┌─────────────────────────────────────────────────────────────┐
95
+ │ Execution & Storage Backends │
96
+ │ Local: Polars, sklearn, XGBoost │
97
+ │ Cloud: BigQuery, Vertex AI, Cloud Storage (planned) │
98
+ │ Cache: SQLite with TTL │
99
+ └─────────────────────────────────────────────────────────────┘
100
+ ```
101
+
102
+ ### **Tech Stack**
103
+
104
+ | Layer | Technologies |
105
+ |-------|-------------|
106
+ | **LLM** | Groq (llama-3.3-70b), Google Gemini (2.0-flash-exp) |
107
+ | **Data Processing** | Polars, DuckDB, Pandas, PyArrow, BigQuery |
108
+ | **ML/AI** | scikit-learn, XGBoost, LightGBM, CatBoost, Optuna |
109
+ | **Visualization** | Matplotlib, Seaborn, Plotly |
110
+ | **EDA Reports** | Sweetviz, ydata-profiling |
111
+ | **Explainability** | SHAP, LIME |
112
+ | **APIs** | FastAPI, Uvicorn |
113
+ | **UI** | Gradio, Typer + Rich (CLI) |
114
+ | **Storage** | SQLite (cache), CSV, Parquet, Google Cloud Storage |
115
+ | **Cloud** | Google Cloud Run, BigQuery, GCS, Vertex AI (planned) |
116
+
117
+ ---
118
+
119
+ ## 🚀 Quick Start
120
+
121
+ ### **Prerequisites**
122
+ - Python 3.9+
123
+ - API Keys: [Groq](https://console.groq.com) or [Google AI Studio](https://makersuite.google.com/app/apikey)
124
+
125
+ ### **Installation**
126
+
127
+ ```bash
128
+ # Clone repository
129
+ git clone https://github.com/Surfing-Ninja/Data-Science-Agent.git
130
+ cd Data-Science-Agent
131
+
132
+ # Create virtual environment
133
+ python -m venv .venv
134
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
135
+
136
+ # Install dependencies
137
+ pip install -r requirements.txt
138
+
139
+ # Set up environment variables
140
+ cp .env.example .env
141
+ # Edit .env and add your API keys:
142
+ # GROQ_API_KEY=your_groq_key
143
+ # GOOGLE_API_KEY=your_google_key (optional)
144
+ # LLM_PROVIDER=groq # or "gemini"
145
+ ```
146
+
147
+ ### **Usage Examples**
148
+
149
+ #### **1. Gradio Web UI** (Recommended for beginners)
150
+ ```bash
151
+ python chat_ui.py
152
+ # Opens at http://localhost:7860
153
+ # Upload CSV → Ask: "Analyze this data and predict house prices"
154
+ ```
155
+
156
+ #### **2. CLI Interface**
157
+ ```bash
158
+ # Complete workflow
159
+ python src/cli.py analyze data.csv --target price --task "Predict house prices"
160
+
161
+ # Quick profiling
162
+ python src/cli.py profile data.csv
163
+
164
+ # Train models only
165
+ python src/cli.py train cleaned.csv Survived --task-type classification
166
+ ```
167
+
168
+ #### **3. Python SDK**
169
+ ```python
170
+ from src.orchestrator import DataScienceCopilot
171
+
172
+ # Initialize agent
173
+ agent = DataScienceCopilot(
174
+ provider="groq", # or "gemini"
175
+ reasoning_effort="medium"
176
+ )
177
+
178
+ # Run workflow
179
+ result = agent.analyze(
180
+ file_path="titanic.csv",
181
+ task_description="Build a model to predict passenger survival",
182
+ target_col="Survived"
183
+ )
184
+
185
+ print(f"Status: {result['status']}")
186
+ print(f"Best Model: {result['best_model']}")
187
+ print(f"Accuracy: {result['best_score']}")
188
+ ```
189
+
190
+ #### **4. REST API** (Cloud Run Ready)
191
+ ```bash
192
+ # Start local server
193
+ cd src/api
194
+ python app.py
195
+ # Server runs at http://localhost:8080
196
+
197
+ # Make API call
198
+ curl -X POST http://localhost:8080/run \
199
+ -F "file=@data.csv" \
200
+ -F "task_description=Analyze and predict churn" \
201
+ -F "target_col=churn"
202
+ ```
203
+
204
+ ---
205
+
206
+ ## 📁 Project Structure
207
+
208
+ ```
209
+ Data-Science-Agent/
210
+ ├── src/
211
+ │ ├── orchestrator.py # Main agent brain (1,136 lines)
212
+ │ ├── cli.py # CLI interface (346 lines)
213
+ │ ├── api/
214
+ │ │ └── app.py # FastAPI Cloud Run wrapper (331 lines)
215
+ │ ├── bigquery/ # BigQuery integration 🆕
216
+ │ │ ├── __init__.py # BigQuery tools (4 functions)
217
+ │ │ └── client.py # BigQuery client wrapper
218
+ │ ├── storage/ # Artifact storage 🆕
219
+ │ │ ├── artifact_store.py # Local + GCS backends (613 lines)
220
+ │ │ └── helpers.py # Storage helper functions (125 lines)
221
+ │ ├── reasoning/ # LLM reasoning layer 🆕
222
+ │ │ ├── __init__.py # Core reasoning engine (350 lines)
223
+ │ │ ├── data_understanding.py # Data insights (6 functions)
224
+ │ │ ├── model_explanation.py # Model interpretation (6 functions)
225
+ │ │ └── business_summary.py # Business translations (7 functions)
226
+ │ ├── cache/
227
+ │ │ └── cache_manager.py # SQLite caching with TTL
228
+ │ ├── tools/ # 82+ specialized tools
229
+ │ │ ├── data_profiling.py # Dataset analysis
230
+ │ │ ├── data_cleaning.py # Cleaning & preprocessing
231
+ │ │ ├── feature_engineering.py # Feature creation
232
+ │ │ ├── model_training.py # ML training
233
+ │ │ ├── visualization_engine.py # Matplotlib/Seaborn plots
234
+ │ │ ├── plotly_visualizations.py # Interactive charts
235
+ │ │ ├── eda_reports.py # Sweetviz, ydata-profiling
236
+ │ │ ├── advanced_*.py # Advanced features
237
+ │ │ └── tools_registry.py # All 82 tool definitions (1,600+ lines)
238
+ │ └── utils/ # Helper utilities
239
+ │ ├── polars_helpers.py # Data manipulation
240
+ │ └── validation.py # Input validation
241
+ ├── chat_ui.py # Gradio web interface (912 lines)
242
+ ├── examples/
243
+ │ └── titanic_example.py # Complete workflow demo
244
+ ├── outputs/
245
+ │ ├── data/ # Processed datasets
246
+ │ ├── models/ # Trained models (.pkl)
247
+ │ ├── plots/ # Visualizations (.png, .html)
248
+ │ └── reports/ # EDA reports (.html)
249
+ ├── cache_db/ # SQLite cache storage
250
+ ├── requirements.txt # Python dependencies
251
+ ├── .env.example # Environment template
252
+ └── README.md # This file
253
+ ```
254
+
255
+ ---
256
+
257
+ ## 🛠️ Tool Categories (82 Tools Total)
258
+
259
+ ### **📊 Data Profiling & Analysis (7 tools)**
260
+ - `profile_dataset`, `detect_data_quality_issues`, `analyze_correlations`, `get_smart_summary`, `compare_datasets`, `calculate_statistics`, `detect_skewness`
261
+
262
+ ### **☁️ BigQuery Integration (4 tools)** 🆕
263
+ - `bigquery_profile_table`, `bigquery_load_table`, `bigquery_execute_query`, `bigquery_write_results`
264
+
265
+ ### **🧹 Data Cleaning (8 tools)**
266
+ - `clean_missing_values`, `handle_outliers`, `remove_duplicates`, `filter_rows`, `rename_columns`, `drop_columns`, `sort_data`, `fix_data_types`
267
+
268
+ ### **🔧 Feature Engineering (13 tools)**
269
+ - `encode_categorical`, `force_numeric_conversion`, `smart_type_inference`, `create_time_features`, `create_interaction_features`, `create_aggregation_features`, `create_ratio_features`, `create_statistical_features`, `create_log_features`, `create_binned_features`, `engineer_text_features`, `auto_feature_engineering`, `auto_feature_selection`
270
+
271
+ ### **🤖 Model Training & Tuning (6 tools)**
272
+ - `train_baseline_models`, `hyperparameter_tuning`, `train_ensemble_models`, `perform_cross_validation`, `generate_model_report`, `auto_ml_pipeline`
273
+
274
+ ### **📈 Visualization (11 tools)**
275
+ - `generate_all_plots`, `generate_data_quality_plots`, `generate_eda_plots`, `generate_model_performance_plots`, `generate_feature_importance_plot`, `generate_interactive_scatter`, `generate_interactive_histogram`, `generate_interactive_correlation_heatmap`, `generate_interactive_box_plots`, `generate_interactive_time_series`, `generate_plotly_dashboard`
276
+
277
+ ### **📊 EDA Reports (3 tools)**
278
+ - `generate_sweetviz_report`, `generate_ydata_profiling_report`, `generate_combined_eda_report`
279
+
280
+ ### **🔬 Advanced Analysis (11 tools)**
281
+ - `perform_eda_analysis`, `detect_model_issues`, `detect_anomalies`, `detect_and_handle_multicollinearity`, `perform_statistical_tests`, `analyze_root_cause`, `detect_trends_and_seasonality`, `detect_anomalies_advanced`, `perform_hypothesis_testing`, `analyze_distribution`, `perform_segment_analysis`
282
+
283
+ ### **📝 Data Wrangling (3 tools)**
284
+ - `merge_datasets`, `concat_datasets`, `reshape_dataset`
285
+
286
+ ### **🚀 MLOps & Production (5 tools)**
287
+ - `monitor_model_drift`, `explain_predictions`, `generate_model_card`, `perform_ab_test_analysis`, `detect_feature_leakage`
288
+
289
+ ### **⏰ Time Series (3 tools)**
290
+ - `forecast_time_series`, `detect_seasonality_trends`, `create_time_series_features`
291
+
292
+ ### **💼 Business Intelligence (4 tools)**
293
+ - `perform_cohort_analysis`, `perform_rfm_analysis`, `detect_causal_relationships`, `generate_business_insights`
294
+
295
+ ### **📚 NLP/Text (4 tools)**
296
+ - `perform_topic_modeling`, `perform_named_entity_recognition`, `analyze_sentiment_advanced`, `perform_text_similarity`
297
+
298
+ ### **🖼️ Computer Vision (3 tools)**
299
+ - `extract_image_features`, `perform_image_clustering`, `analyze_tabular_image_hybrid`
300
+
301
+ ---
302
+
303
+ ## 🎯 Advanced Features
304
+
305
+ ### **1. Session Memory**
306
+ The agent remembers context across conversations:
307
+
308
+ ```python
309
+ # Conversation 1
310
+ "Train a model on earthquake.csv to predict magnitude"
311
+ → Agent trains XGBoost, achieves 0.92 R²
312
+
313
+ # Conversation 2 (Same session)
314
+ "Cross-validate it"
315
+ → Agent knows: model=XGBoost, dataset=earthquake.csv, target=magnitude
316
+ → Runs 5-fold CV automatically
317
+ ```
318
+
319
+ ### **2. Code Interpreter**
320
+ Execute custom Python code for tasks beyond predefined tools:
321
+
322
+ ```python
323
+ User: "Make a Plotly scatter with custom dropdown filters"
324
+
325
+ Agent: execute_python_code(code='''
326
+ import plotly.graph_objects as go
327
+ df = pd.read_csv('./temp/data.csv')
328
+ # Custom visualization code...
329
+ fig.write_html('./outputs/code/custom_plot.html')
330
+ ''')
331
+ ```
332
+
333
+ ### **3. Token Optimization**
334
+ System stays under LLM token limits even with 75 tools:
335
+
336
+ | Component | Before | After | Savings |
337
+ |-----------|--------|-------|---------|
338
+ | Tool Schemas | 8,193 tokens | 5,463 tokens | 34% |
339
+ | Tool Results | 5,000+ tokens | 50-200 tokens | 90%+ |
340
+
341
+ ### **4. Error Recovery**
342
+ Agent learns from errors and auto-corrects:
343
+
344
+ ```python
345
+ # Attempt 1
346
+ train_baseline_models(target_col="magnitude")
347
+ → Error: Column 'magnitude' not found. Hint: Did you mean 'mag'?
348
+
349
+ # Attempt 2 (Automatic)
350
+ train_baseline_models(target_col="mag")
351
+ → Success! Trained 4 models, best: XGBoost (0.92 R²)
352
+ ```
353
+
354
+ ---
355
+
356
+ ## ☁️ Cloud Features
357
+
358
+ ### **1. BigQuery Integration** 🆕
359
+ Direct access to BigQuery tables without local downloads:
360
+
361
+ ```python
362
+ # Profile a BigQuery table
363
+ agent.chat("Profile the table project.dataset.sales")
364
+
365
+ # Query and analyze
366
+ agent.chat("Query top 10 customers by revenue from BigQuery")
367
+
368
+ # Write results back
369
+ agent.chat("Write the cleaned data to BigQuery table project.dataset.sales_clean")
370
+ ```
371
+
372
+ **Available Tools:**
373
+ - `bigquery_profile_table`: Get statistics for any BigQuery table
374
+ - `bigquery_load_table`: Load BigQuery data into local Polars DataFrame
375
+ - `bigquery_execute_query`: Run SQL queries directly on BigQuery
376
+ - `bigquery_write_results`: Write processed data back to BigQuery
377
+
378
+ **Setup:**
379
+ ```bash
380
+ # Install BigQuery dependencies
381
+ pip install google-cloud-bigquery db-dtypes
382
+
383
+ # Set environment variable
384
+ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json"
385
+ ```
386
+
387
+ **Looker-Compatible Schemas:**
388
+
389
+ The project defines stable BigQuery table schemas for BI tools (see [`BIGQUERY_SCHEMAS.md`](BIGQUERY_SCHEMAS.md)):
390
+ - 📊 `model_metrics` - Model performance tracking over time
391
+ - 🎯 `feature_importance` - Feature impact analysis
392
+ - 🔮 `predictions` - Prediction monitoring with actuals
393
+ - 📋 `data_profile_summary` - Data quality metrics
394
+
395
+ **Design Principles:**
396
+ - Stable schemas (no breaking changes without versioning)
397
+ - Consistent snake_case naming
398
+ - Clear dimension/metric separation
399
+ - Dashboard-ready with sample Looker views
400
+
401
+ ### **2. Artifact Storage** 🆕
402
+ Unified storage abstraction - switch between local and GCS with zero code changes:
403
+
404
+ ```python
405
+ # Local storage (default)
406
+ agent.save_model(model, "my_model.pkl")
407
+ # → Saves to outputs/models/my_model.pkl
408
+
409
+ # GCS storage (automatic when GCS credentials present)
410
+ agent.save_model(model, "my_model.pkl")
411
+ # → Saves to gs://your-bucket/models/my_model_v1.pkl with versioning
412
+ ```
413
+
414
+ **Features:**
415
+ - **Automatic Backend Selection**: Uses GCS if credentials available, falls back to local
416
+ - **Versioning**: Automatic version suffixes for GCS artifacts
417
+ - **Metadata**: Stores creation time, size, checksums
418
+ - **Unified API**: Same code works for local and cloud storage
419
+
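+ The backend selection reduces to one probe: if a GCS bucket is configured, upload there; otherwise copy locally. A hedged sketch of the pattern (function and path names are placeholders, not the actual artifact_store.py API):
+
+ ```python
+ import os, shutil
+
+ def save_artifact(local_path: str, name: str) -> str:
+     bucket_name = os.getenv("GCS_BUCKET")
+     if bucket_name:
+         from google.cloud import storage  # requires google-cloud-storage
+         blob = storage.Client().bucket(bucket_name).blob(f"models/{name}")
+         blob.upload_from_filename(local_path)
+         return f"gs://{bucket_name}/models/{name}"
+     # Fallback: plain local copy under outputs/
+     dest = os.path.join("outputs", "models", name)
+     os.makedirs(os.path.dirname(dest), exist_ok=True)
+     shutil.copy(local_path, dest)
+     return dest
+ ```
+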
420
+ **Setup:**
421
+ ```bash
422
+ # Install GCS dependencies
423
+ pip install google-cloud-storage
424
+
425
+ # Set bucket (optional, defaults to local)
426
+ export GCS_BUCKET="your-gcs-bucket-name"
427
+ ```
428
+
429
+ ### **3. Reasoning Modules** 🆕
430
+ Dedicated LLM reasoning layer with clear boundaries (no raw data access, no training decisions):
431
+
432
+ ```python
433
+ from reasoning.data_understanding import explain_dataset
434
+ from reasoning.model_explanation import explain_model_performance
435
+ from reasoning.business_summary import create_executive_summary
436
+
437
+ # Data insights
438
+ insights = explain_dataset(summary={
439
+ "rows": 10000,
440
+ "columns": 20,
441
+ "missing_values": {"age": {"count": 150, "percentage": 1.5}}
442
+ })
443
+
444
+ # Model explanations
445
+ explanation = explain_model_performance(metrics={
446
+ "accuracy": 0.95,
447
+ "precision": 0.92,
448
+ "recall": 0.88
449
+ }, task_type="classification")
450
+
451
+ # Business summaries
452
+ summary = create_executive_summary(
453
+ project_results={"model_accuracy": 0.95},
454
+ project_name="churn_prediction",
455
+ business_objective="Reduce customer churn"
456
+ )
457
+ ```
458
+
459
+ **19 Reasoning Functions:**
460
+ - **Data Understanding**: explain_dataset, suggest_transformations, identify_feature_engineering_opportunities, explain_missing_values, compare_datasets (6 functions)
461
+ - **Model Explanation**: explain_model_performance, interpret_feature_importance, diagnose_model_failure, explain_prediction, compare_models, explain_overfitting (6 functions)
462
+ - **Business Summary**: create_executive_summary, estimate_business_impact, create_stakeholder_report, translate_technical_to_business, prioritize_next_steps, explain_to_customer, assess_deployment_readiness (7 functions)
463
+
464
+ **Design Principles:**
465
+ - ✅ **NO Raw Data Access**: Only summaries/statistics allowed
466
+ - ✅ **NO Training Decisions**: Only explanations, never execution
467
+ - ✅ **Structured Output**: JSON schemas for cacheability
468
+ - ✅ **Dual Backend**: Works with both Gemini and Groq
469
+
470
+ ---
471
+
472
+ ## 🔧 Configuration
473
+
474
+ ### **Environment Variables** (`.env`)
475
+
476
+ ```bash
477
+ # LLM Provider
478
+ LLM_PROVIDER=groq # "groq" or "gemini"
479
+ GROQ_API_KEY=your_groq_key
480
+ GOOGLE_API_KEY=your_google_key # Optional
481
+
482
+ # Model Selection
483
+ GROQ_MODEL=llama-3.3-70b-versatile
484
+ GEMINI_MODEL=gemini-2.0-flash-exp
485
+ REASONING_EFFORT=medium # low, medium, high
486
+
487
+ # Cache Settings
488
+ CACHE_DB_PATH=./cache_db/cache.db
489
+ CACHE_TTL_SECONDS=86400 # 24 hours
490
+
491
+ # Cloud Features (Optional)
492
+ GCS_BUCKET=your-gcs-bucket-name # For artifact storage
493
+ GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-key.json # For BigQuery + GCS
494
+
495
+ # Cloud Run (for API deployment)
496
+ PORT=8080
497
+ ```
498
+
499
+ ### **Provider Comparison**
500
+
501
+ | Feature | Groq | Gemini |
502
+ |---------|------|--------|
503
+ | **Model** | llama-3.3-70b-versatile | gemini-2.0-flash-exp |
504
+ | **Speed** | ⚡ Extremely fast (LPU) | 🚀 Very fast |
505
+ | **Free Tier** | 100K tokens/day | 1,500 requests/day |
506
+ | **Rate Limit** | 12K tokens/min | 10 requests/min |
507
+ | **Best For** | High-volume, low-latency | Free tier, high quota |
508
+
509
+ ---
510
+
511
+ ## 🚀 Cloud Deployment (Google Cloud Run)
512
+
513
+ ### **Deploy REST API**
514
+
515
+ ```bash
516
+ # 1. Build Docker image (Dockerfile provided)
517
+ docker build -t data-science-agent .
518
+
519
+ # 2. Push to Google Container Registry
520
+ gcloud builds submit --tag gcr.io/PROJECT_ID/data-science-agent
521
+
522
+ # 3. Deploy to Cloud Run
523
+ gcloud run deploy data-science-agent \
524
+ --image gcr.io/PROJECT_ID/data-science-agent \
525
+ --platform managed \
526
+ --region us-central1 \
527
+ --allow-unauthenticated \
528
+ --memory 4Gi \
529
+ --timeout 3600 \
530
+ --set-env-vars GROQ_API_KEY=your_key,LLM_PROVIDER=groq
531
+
532
+ # 4. Test deployment
533
+ curl -X POST https://your-service-url/run \
534
+ -F "file=@data.csv" \
535
+ -F "task_description=Predict churn"
536
+ ```
537
+
538
+ ### **API Endpoints**
539
+
540
+ - `GET /` - Health check
541
+ - `GET /health` - Readiness probe
542
+ - `POST /run` - Full analysis workflow
543
+ - `POST /profile` - Quick dataset profiling
544
+ - `GET /tools` - List all available tools
545
+
546
+ ---
547
+
548
+ ## 🗺️ Roadmap
549
+
550
+ ### **Phase 1: Core Agent** ✅ COMPLETE
551
+ - [x] 75 specialized tools
552
+ - [x] Dual LLM support (Groq + Gemini)
553
+ - [x] CLI + Gradio UI
554
+ - [x] SQLite caching
555
+ - [x] Token optimization
556
+
557
+ ### **Phase 2: Intelligence** ✅ COMPLETE
558
+ - [x] Session memory
559
+ - [x] Code interpreter
560
+ - [x] Error recovery
561
+ - [x] EDA reports (Sweetviz, ydata-profiling)
562
+ - [x] Interactive Plotly visualizations
563
+
564
+ ### **Phase 3: Cloud Native** ✅ COMPLETE
565
+ - [x] FastAPI Cloud Run wrapper with 4 REST endpoints
566
+ - [x] BigQuery integration (4 tools: profile, load, query, write)
567
+ - [x] Artifact Storage abstraction (Local ↔ GCS switching)
568
+ - [x] Reasoning modules for LLM explanations (19 functions)
569
+ - [x] Looker-compatible BigQuery schemas (4 stable tables)
570
+ - [ ] Vertex AI model training (planned)
571
+ - [ ] Cloud Logging & Monitoring (planned)
572
+
573
+ ### **Phase 4: Enterprise** 📋 PLANNED
574
+ - [ ] Multi-user authentication
575
+ - [ ] Team workspaces
576
+ - [ ] Model registry
577
+ - [ ] Automated retraining pipelines
578
+
579
+ ### **Phase 5: Kaggle Integration** 🎯 FUTURE
580
+ - [ ] Direct Kaggle API integration
581
+ - [ ] Automated competition workflow
582
+ - [ ] Ensemble strategies
583
+ - [ ] Submission automation
584
+
585
+ ---
586
+
587
+ ## 🤝 Contributing
588
+
589
+ Contributions welcome! Areas for improvement:
590
+
591
+ 1. **New Tools**: Time series forecasting, NLP preprocessing, image augmentation
592
+ 2. **Cloud Backends**: AWS, Azure support
593
+ 3. **Performance**: Optimize tool execution, reduce latency
594
+ 4. **UI/UX**: Better visualization, workflow builder
595
+ 5. **Documentation**: Tutorials, video guides, blog posts
596
+
597
+ ---
598
+
599
+ ## 📜 License
600
+
601
+ MIT License - See LICENSE file for details
602
+
603
+ ---
604
+
605
+ ## 📧 Support & Community
606
+
607
+ - **Issues**: [GitHub Issues](https://github.com/Surfing-Ninja/Data-Science-Agent/issues)
608
+ - **Discussions**: [GitHub Discussions](https://github.com/Surfing-Ninja/Data-Science-Agent/discussions)
609
+
610
+ ---
611
+
612
+ ## 📊 Project Stats
613
+
614
+ - **Lines of Code**: ~18,000+
615
+ - **Tools**: 82 specialized functions (75 core + 4 BigQuery + 3 storage helpers)
616
+ - **Reasoning Functions**: 19 LLM-powered explanation modules
617
+ - **Supported Models**: 10+ (LR, Ridge, Lasso, RF, XGBoost, LightGBM, CatBoost, etc.)
618
+ - **Visualization Types**: 20+ (static + interactive)
619
+ - **Data Formats**: CSV, Parquet, JSON, BigQuery tables
620
+ - **Cloud Platforms**: Google Cloud (Run, BigQuery, GCS) - AWS/Azure planned
621
+
622
+ ---
623
+
624
+ <div align="center">
625
+
626
+ **Built with ❤️ for the Data Science Community**
627
+
628
+ *"Making data science accessible through AI automation"*
629
+
630
+ ⭐ Star this repo if you find it useful! ⭐
631
+
632
+ </div>
build-and-deploy.ps1 ADDED
@@ -0,0 +1,39 @@
1
+ # Build and Deploy Script for Data Science Agent (Windows)
2
+
3
+ Write-Host "🚀 Building and Deploying Data Science Agent..." -ForegroundColor Cyan
4
+
5
+ # Step 1: Build React Frontend
6
+ Write-Host ""
7
+ Write-Host "📦 Building React frontend..." -ForegroundColor Yellow
8
+ Set-Location FRRONTEEEND
9
+ npm.cmd install
10
+ if ($LASTEXITCODE -ne 0) {
11
+ Write-Host "❌ Frontend npm install failed!" -ForegroundColor Red
12
+ exit 1
13
+ }
14
+ npm.cmd run build
15
+ if ($LASTEXITCODE -ne 0) {
16
+ Write-Host "❌ Frontend build failed!" -ForegroundColor Red
17
+ exit 1
18
+ }
19
+ Set-Location ..
20
+
21
+ Write-Host ""
22
+ Write-Host "✅ Frontend built successfully!" -ForegroundColor Green
23
+ Write-Host " Built files are in: FRRONTEEEND\dist" -ForegroundColor Gray
24
+
25
+ # Step 2: Install Python dependencies
26
+ Write-Host ""
27
+ Write-Host "📦 Installing Python dependencies..." -ForegroundColor Yellow
28
+ pip install -r requirements.txt
29
+ if ($LASTEXITCODE -ne 0) {
30
+ Write-Host "⚠️ Some Python dependencies may have failed to install" -ForegroundColor Yellow
31
+ }
32
+
33
+ Write-Host ""
34
+ Write-Host "✅ Build complete!" -ForegroundColor Green
35
+ Write-Host ""
36
+ Write-Host "To run the application:" -ForegroundColor Cyan
37
+ Write-Host " python src\api\app.py" -ForegroundColor White
38
+ Write-Host ""
39
+ Write-Host "Access the app at: http://localhost:8080" -ForegroundColor Green
build-and-deploy.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+ # Build and Deploy Script for Data Science Agent
3
+
4
+ set -e # Exit on error
5
+
6
+ echo "🚀 Building and Deploying Data Science Agent..."
7
+
8
+ # Step 1: Build React Frontend
9
+ echo ""
10
+ echo "📦 Building React frontend..."
11
+ cd FRRONTEEEND
12
+ npm install
13
+ npm run build
14
+ cd ..
15
+
16
+ # Step 2: Copy built frontend to deployment location (if needed)
17
+ echo ""
18
+ echo "✅ Frontend built successfully!"
19
+ echo " Built files are in: FRRONTEEEND/dist"
20
+
21
+ # Step 3: Install Python dependencies
22
+ echo ""
23
+ echo "📦 Installing Python dependencies..."
24
+ pip install -r requirements.txt
25
+
26
+ echo ""
27
+ echo "✅ Build complete!"
28
+ echo ""
29
+ echo "To run the application:"
30
+ echo " 1. Backend: python -m uvicorn src.api.app:app --host 0.0.0.0 --port 8080"
31
+ echo " 2. Or use: python src/api/app.py"
32
+ echo ""
33
+ echo "Access the app at: http://localhost:8080"
cache_db/.gitkeep ADDED
File without changes
chat_ui.py ADDED
@@ -0,0 +1,1073 @@
1
+ """
2
+ AI Agent Data Scientist - Interactive Chat UI
3
+ ==============================================
4
+
5
+ A simple web interface to interact with your AI Agent.
6
+ Upload datasets, ask questions, and get AI-powered insights!
7
+ """
8
+
9
+ import gradio as gr
10
+ import sys
11
+ import os
12
+ import shutil
13
+ from pathlib import Path
14
+ import traceback
15
+
16
+ # Add src to path
17
+ sys.path.append('src')
18
+
19
+ from tools.data_profiling import profile_dataset, detect_data_quality_issues
20
+ from tools.model_training import train_baseline_models
21
+
22
+ # Try to import AI agent (optional)
23
+ try:
24
+ from orchestrator import DataScienceCopilot
25
+ agent = DataScienceCopilot()
26
+ AI_ENABLED = True
27
+ print("✅ AI Agent loaded successfully!")
28
+ print(f"📊 Model: {agent.model}")
29
+ print(f"🔧 Tools available: {len(agent.tool_functions)}")
30
+ except Exception as e:
31
+ print(f"ℹ️ Running in manual mode (AI agent not available)")
32
+ print(f" Error: {str(e)}")
33
+ print("💡 You can still use all the quick actions and tools!")
34
+ AI_ENABLED = False
35
+ agent = None
36
+
37
+ # Store uploaded file path
38
+ current_file = None
39
+ current_profile = None
40
+ last_agent_response = None # Store last agent response for visualization extraction
41
+
42
+
43
+ # Helper functions for Gradio 6.x message format
44
+ def add_message(history, role, content):
45
+ """Add a message to history in Gradio 6.x format."""
46
+ if history is None:
47
+ history = []
48
+ history.append({"role": role, "content": content})
49
+ return history
50
+
51
+
52
+ def add_user_message(history, content):
53
+ """Add a user message to history."""
54
+ return add_message(history, "user", content)
55
+
56
+
57
+ def add_assistant_message(history, content):
58
+ """Add an assistant message to history."""
59
+ return add_message(history, "assistant", content)
60
+
61
+
62
+ def update_last_assistant_message(history, content):
63
+ """Update the last assistant message in history."""
64
+ if history and len(history) > 0 and history[-1].get("role") == "assistant":
65
+ history[-1]["content"] = content
66
+ return history
67
+
68
+
69
+ def get_last_user_content(history):
70
+ """Get the content of the last user message."""
71
+ if history:
72
+ for msg in reversed(history):
73
+ if msg.get("role") == "user":
74
+ return msg.get("content", "")
75
+ return ""
76
+
77
+
78
+ def analyze_dataset(file, user_message, history):
79
+ """Process uploaded dataset(s) and user message. Supports single or multiple file uploads."""
80
+ global current_file, current_profile, last_agent_response
81
+
82
+ # Initialize with empty plot list (will collect PNG file paths)
83
+ plots_paths = []
84
+ html_reports = [] # Initialize HTML reports list
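+ # NOTE: every yield from this generator must produce exactly four values —
+ # (chat history, cleared input, plot paths, HTML report paths) — matching
+ # the four output components wired to it in the Gradio event handlers below.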
85
+
86
+ # Initialize history if None
87
+ if history is None:
88
+ history = []
89
+
90
+ # Debug: Log the call
91
+ print(f"[DEBUG] analyze_dataset called - file: {file is not None}, message: '{user_message}', current_file: {current_file}")
92
+
93
+ try:
94
+ # Handle file uploads (single or multiple)
95
+ if file is not None:
96
+ # file can be a single filepath or a list of filepaths
97
+ files_to_process = file if isinstance(file, list) else [file]
98
+
99
+ # Filter out None values
100
+ files_to_process = [f for f in files_to_process if f is not None]
101
+
102
+ if len(files_to_process) > 0:
103
+ print(f"[DEBUG] Processing {len(files_to_process)} file(s) upload")
104
+
105
+ # Copy all files to simpler paths
106
+ os.makedirs("./temp", exist_ok=True)
107
+ processed_files = []
108
+ seen_files = {} # Track files by content hash to detect duplicates
109
+ duplicate_count = 0
110
+
111
+ for uploaded_file in files_to_process:
112
+ simple_filename = Path(uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file).name
113
+ file_source = uploaded_file.name if hasattr(uploaded_file, 'name') else uploaded_file
114
+
115
+ # Calculate file hash to detect duplicates (even with different names)
116
+ import hashlib
117
+ hasher = hashlib.md5()
118
+ with open(file_source, 'rb') as f:
119
+ # Read file in chunks to handle large files efficiently
120
+ for chunk in iter(lambda: f.read(8192), b""):
121
+ hasher.update(chunk)
122
+ file_hash = hasher.hexdigest()
123
+
124
+ # Check if this exact file was already uploaded
125
+ if file_hash in seen_files:
126
+ print(f"[DEBUG] Duplicate file detected: {simple_filename} (same as {seen_files[file_hash]})")
127
+ duplicate_count += 1
128
+ continue # Skip duplicate
129
+
130
+ # Not a duplicate - process it
131
+ simple_path = f"./temp/{simple_filename}"
132
+
133
+ # Handle filename collision (different files with same name)
134
+ if os.path.exists(simple_path):
135
+ # Check if existing file is the same (by comparing with already processed files)
136
+ existing_in_processed = simple_path in processed_files
137
+ if not existing_in_processed:
138
+ # Different file with same name - add suffix
139
+ base_name = Path(simple_filename).stem
140
+ extension = Path(simple_filename).suffix
141
+ counter = 1
142
+ while os.path.exists(f"./temp/{base_name}_{counter}{extension}"):
143
+ counter += 1
144
+ simple_filename = f"{base_name}_{counter}{extension}"
145
+ simple_path = f"./temp/{simple_filename}"
146
+ print(f"[DEBUG] Filename collision - renamed to: {simple_filename}")
147
+
148
+ shutil.copy2(file_source, simple_path)
149
+ processed_files.append(simple_path)
150
+ seen_files[file_hash] = simple_filename
151
+ print(f"[DEBUG] Copied file to: {simple_path}")
152
+
153
+ # Set current_file to the first file (for single-file operations)
154
+ # For multi-file operations, the agent will use all files from ./temp/
155
+ current_file = processed_files[0] if processed_files else None
156
+
157
+ # Only show file upload response if there's no user message
158
+ if not (user_message and user_message.strip()):
159
+ if len(processed_files) == 0:
160
+ # All files were duplicates
161
+ response = f"⚠️ **No New Files Uploaded**\n\n"
162
+ response += f"All {len(files_to_process)} file(s) were duplicates of already uploaded files.\n\n"
163
+ response += "Your previously uploaded dataset is still active."
164
+ elif len(processed_files) == 1:
165
+ # Single file upload - show detailed profile
166
+ response = f"📊 **Dataset Uploaded Successfully!**\n\n"
167
+ if duplicate_count > 0:
168
+ response += f"ℹ️ *({duplicate_count} duplicate file(s) were skipped)*\n\n"
169
+ response += f"**File:** {Path(current_file).name}\n\n"
170
+
171
+ # Get basic profile
172
+ profile = profile_dataset(current_file)
173
+ current_profile = profile
174
+
175
+ response += f"**Dataset Overview:**\n"
176
+ response += f"- Rows: {profile['shape']['rows']:,}\n"
177
+ response += f"- Columns: {profile['shape']['columns']}\n"
178
+
179
+ # Handle memory_usage (can be float or dict)
180
+ memory = profile.get('memory_usage', 0)
181
+ if isinstance(memory, dict):
182
+ memory = memory.get('total_mb', 0)
183
+ response += f"- Memory: {memory:.2f} MB\n\n"
184
+
185
+ response += f"**Column Types:**\n"
186
+ response += f"- Numeric: {len(profile['column_types']['numeric'])} columns\n"
187
+ response += f"- Categorical: {len(profile['column_types']['categorical'])} columns\n"
188
+ response += f"- Datetime: {len(profile['column_types']['datetime'])} columns\n\n"
189
+
190
+ # Check data quality
191
+ quality = detect_data_quality_issues(current_file)
192
+ if quality['critical']:
193
+ response += f"🔴 **Critical Issues:** {len(quality['critical'])}\n"
194
+ for issue in quality['critical'][:3]:
195
+ response += f" - {issue['message']}\n"
196
+ if quality['warning']:
197
+ response += f"🟡 **Warnings:** {len(quality['warning'])}\n"
198
+ for issue in quality['warning'][:3]:
199
+ response += f" - {issue['message']}\n"
200
+ else:
201
+ # Multiple files uploaded
202
+ response = f"📊 **{len(processed_files)} Datasets Uploaded Successfully!**\n\n"
203
+ if duplicate_count > 0:
204
+ response += f"ℹ️ *({duplicate_count} duplicate file(s) were skipped)*\n\n"
205
+ response += f"**Files:**\n"
206
+ for i, fp in enumerate(processed_files, 1):
207
+ response += f"{i}. {Path(fp).name}\n"
208
+ response += f"\n**💡 You can now use multi-dataset operations!**\n\n"
209
+
210
+ response += f"\n\n💬 **What would you like to do with {'this dataset' if len(processed_files) == 1 else 'these datasets'}?**\n\n"
211
+ response += "You can ask me to:\n"
212
+ if len(processed_files) > 1:
213
+ response += "- **Merge these datasets** (e.g., 'merge customers and orders on customer_id')\n"
214
+ response += "- **Combine/concatenate** them (e.g., 'combine all monthly sales files')\n"
215
+ response += "- Train a classification or regression model\n"
216
+ response += "- Analyze specific columns\n"
217
+ response += "- Detect outliers\n"
218
+ response += "- Engineer features\n"
219
+ response += "- Generate predictions\n"
220
+ response += "- And much more!\n"
221
+
222
+ # Add assistant message to history
223
+ history = add_assistant_message(history, response)
224
+ yield history, "", [], []
225
+ return
226
+ # If user uploaded file AND sent a message, don't return - continue to process the message
227
+ elif user_message and user_message.strip():
228
+ # Continue processing the message below
229
+ pass
230
+
231
+ # If user sends a message about the current file
232
+ print(f"[DEBUG] Checking message conditions: user_message={bool(user_message and user_message.strip())}, current_file={bool(current_file)}")
233
+ if user_message and user_message.strip() and current_file:
234
+ print(f"[DEBUG] User message detected. AI_ENABLED={AI_ENABLED}, agent={agent is not None}")
235
+ if AI_ENABLED and agent:
236
+ print(f"[DEBUG] Entering AI Agent block...")
237
+ try:
238
+ # Show immediate processing message
239
+ print(f"🤖 AI Agent analyzing: {user_message}")
240
+ history = add_user_message(history, user_message)
241
+ history = add_assistant_message(history, "🤖 **AI Agent is thinking...**\n\n⏳ Analyzing your request and planning the workflow...")
242
+ yield history, "", [], []
243
+
244
+ # Use the AI agent to process the request
245
+ print(f"📂 File path: {current_file}")
246
+ print(f"📝 Task: {user_message}")
247
+ print(f"🚀 Calling agent.analyze()...")
248
+
249
+ agent_response = agent.analyze(
250
+ file_path=current_file,
251
+ task_description=user_message,
252
+ use_cache=False, # Disable cache to avoid dict hashing issues
253
+ stream=False
254
+ )
255
+
256
+ print(f"✅ Agent response received: {agent_response.get('status', 'unknown')}")
257
+
258
+ # Store agent response for visualization extraction
259
+ last_agent_response = agent_response
260
+
261
+ # Format the response
262
+ if agent_response.get('status') == 'success':
263
+ response = f"🤖 **AI Agent Analysis Complete!**\n\n"
264
+ response += f"{agent_response.get('summary', '')}\n\n"
265
+
266
+ if 'workflow_history' in agent_response and agent_response['workflow_history']:
267
+ response += f"**Execution Summary:**\n"
268
+ response += f"- Tools Executed: {len(agent_response['workflow_history'])}\n"
269
+ response += f"- Iterations: {agent_response.get('iterations', 0)}\n"
270
+ response += f"- Time: {agent_response.get('execution_time', 0):.1f}s\n\n"
271
+
272
+ # Find and display MODEL TRAINING RESULTS with ALL METRICS
273
+ model_results = None
274
+ for step in agent_response['workflow_history']:
275
+ if step.get('tool') == 'train_baseline_models':
276
+ result = step.get('result', {})
277
+ if isinstance(result, dict) and 'result' in result:
278
+ model_results = result['result']
279
+ elif isinstance(result, dict):
280
+ model_results = result
281
+ break
282
+
283
+ if model_results and 'models' in model_results:
284
+ response += f"## 🎯 Model Training Results\n\n"
285
+ task_type = model_results.get('task_type', 'unknown')
286
+ response += f"**Task Type:** {task_type.title()}\n"
287
+ response += f"**Features:** {model_results.get('n_features', 0)}\n"
288
+ response += f"**Training Samples:** {model_results.get('train_size', 0):,}\n"
289
+ response += f"**Test Samples:** {model_results.get('test_size', 0):,}\n\n"
290
+
291
+ # Show ALL models tested
292
+ response += "### 📊 All Models Tested:\n\n"
293
+ models_data = model_results.get('models', {})
294
+
295
+ for model_name, model_info in models_data.items():
296
+ if 'test_metrics' in model_info:
297
+ metrics = model_info['test_metrics']
298
+ response += f"**{model_name}:**\n"
299
+
300
+ if task_type == 'classification':
301
+ response += f"- Accuracy: {metrics.get('accuracy', 0):.4f}\n"
302
+ response += f"- Precision: {metrics.get('precision', 0):.4f}\n"
303
+ response += f"- Recall: {metrics.get('recall', 0):.4f}\n"
304
+ response += f"- F1 Score: {metrics.get('f1', 0):.4f}\n"
305
+ else:
306
+ response += f"- R² Score: {metrics.get('r2', 0):.4f}\n"
307
+ response += f"- RMSE: {metrics.get('rmse', 0):.2f}\n"
308
+ response += f"- MAE: {metrics.get('mae', 0):.2f}\n"
309
+ response += f"- MAPE: {metrics.get('mape', 0):.2f}%\n"
310
+ response += "\n"
311
+
312
+ # Highlight BEST MODEL
313
+ best_model = model_results.get('best_model', {})
314
+ if best_model and best_model.get('name'):
315
+ response += f"### 🏆 Best Model: **{best_model['name']}**\n"
316
+ response += f"Score: {best_model.get('score', 0):.4f}\n\n"
317
+
318
+ # Show workflow execution summary
319
+ response += "### 🔧 Workflow Steps:\n"
320
+ for i, step in enumerate(agent_response['workflow_history'], 1):
321
+ tool_name = step['tool']
322
+ success = step['result'].get('success', False)
323
+ icon = "✅" if success else "❌"
324
+ response += f"{i}. {icon} {tool_name}\n"
325
+ response += "\n"
326
+
327
+ # Check for plots AND reports in workflow results
328
+ html_reports = [] # Separate list for HTML reports
329
+
330
+ for step in agent_response['workflow_history']:
331
+ result = step.get('result', {})
332
+
333
+ # Deep search for plots and reports in nested results
334
+ def find_plots_and_reports(obj, plots_list, reports_list):
335
+ if isinstance(obj, dict):
336
+ # Check direct plot/report keys
337
+ for key in ['plot_path', 'plot_file', 'output_path', 'html_path', 'report_path',
338
+ 'plots', 'plot_paths', 'performance_plots', 'feature_importance_plot']:
339
+ if key in obj and obj[key]:
340
+ if isinstance(obj[key], list):
341
+ for path in obj[key]:
342
+ if isinstance(path, str) and os.path.exists(path):
343
+ if path.endswith('.html'):
344
+ # Check if it's a report (in reports folder) or interactive plot
345
+ if '/reports/' in path or 'report' in Path(path).stem.lower():
346
+ reports_list.append(path)
347
+ else:
348
+ reports_list.append(path) # Interactive plots also go to reports
349
+ elif path.endswith(('.png', '.jpg', '.jpeg')):
350
+ plots_list.append(path)
351
+ elif isinstance(obj[key], str) and os.path.exists(obj[key]):
352
+ if obj[key].endswith('.html'):
353
+ if '/reports/' in obj[key] or 'report' in Path(obj[key]).stem.lower():
354
+ reports_list.append(obj[key])
355
+ else:
356
+ reports_list.append(obj[key])
357
+ elif obj[key].endswith(('.png', '.jpg', '.jpeg')):
358
+ plots_list.append(obj[key])
359
+ # Recursively search nested dicts
360
+ for value in obj.values():
361
+ find_plots_and_reports(value, plots_list, reports_list)
362
+
363
+ find_plots_and_reports(result, plots_paths, html_reports)
364
+
365
+ # Remove duplicates while preserving order
366
+ plots_paths = list(dict.fromkeys(plots_paths))
367
+ html_reports = list(dict.fromkeys(html_reports))
368
+
369
+ # Display visualization and report information in response
370
+ if plots_paths or html_reports:
371
+ response += f"## 📊 Generated Outputs\n\n"
372
+
373
+ if plots_paths:
374
+ response += f"### 📈 Visualizations ({len(plots_paths)} plots)\n"
375
+ response += "✅ Plots are displayed in the **Visualization Gallery** below!\n\n"
376
+
377
+ # List plot files
378
+ for i, plot_path in enumerate(plots_paths[:10], 1):
379
+ try:
380
+ plot_name = Path(plot_path).stem.replace('_', ' ').title()
381
+ rel_path = os.path.relpath(plot_path, '.')
382
+ response += f"{i}. 📊 **{plot_name}**\n"
383
+ response += f" 📁 `{rel_path}`\n\n"
384
+ except Exception as e:
385
+ response += f"{i}. ❌ Error: {str(e)}\n"
386
+
387
+ if html_reports:
388
+ response += f"### 📋 Reports & Interactive Plots ({len(html_reports)} files)\n"
389
+ response += "✅ Reports are displayed in the **Reports Viewer** below!\n\n"
390
+
391
+ # List report files
392
+ for i, report_path in enumerate(html_reports[:10], 1):
393
+ try:
394
+ report_name = Path(report_path).stem.replace('_', ' ').title()
395
+ rel_path = os.path.relpath(report_path, '.')
396
+ file_size = os.path.getsize(report_path) / 1024 # KB
397
+ response += f"{i}. 📄 **{report_name}**\n"
398
+ response += f" 📁 `{rel_path}` ({file_size:.1f} KB)\n\n"
399
+ except Exception as e:
400
+ response += f"{i}. ❌ Error: {str(e)}\n"
401
+ else:
402
+ response += "ℹ️ No visualizations or reports were generated in this workflow.\n"
403
+ else:
404
+ response = f"⚠️ **AI Agent Status:** {agent_response.get('status', 'unknown')}\n\n"
405
+ response += f"{agent_response.get('message', agent_response.get('error', 'Unknown error'))}\n"
406
+
407
+ # Update the last assistant message with the response
408
+ history = update_last_assistant_message(history, response)
409
+
410
+ # Return plot paths for gallery and html_reports for HTML viewer
411
+ # Store html_reports in a format the HTML component can use
412
+ yield history, "", plots_paths if plots_paths else [], html_reports if html_reports else []
413
+ return
414
+ except Exception as e:
415
+ import sys
416
+ exc_type, exc_value, exc_traceback = sys.exc_info()
417
+ response = f"⚠️ **AI Agent Error:**\n\n"
418
+ response += f"**Error Type:** {exc_type.__name__}\n\n"
419
+ response += f"**Error Message:** {str(e)}\n\n"
420
+ response += f"**Full Traceback:**\n```python\n{traceback.format_exc()}\n```\n\n"
421
+ response += "💡 **Fallback Options:**\n"
422
+ response += "- Use the **Quick Train** feature on the right\n"
423
+ response += "- Try manual commands: `profile`, `quality`, `columns`\n"
424
+ # Update the last assistant message with error
425
+ history = update_last_assistant_message(history, response)
426
+ yield history, "", plots_paths if plots_paths else [], []
427
+ return
428
+ else:
429
+ # Manual mode - Handle commands directly
430
+ user_msg_lower = user_message.lower().strip()
431
+
432
+ # Handle simple commands manually
433
+ if 'profile' in user_msg_lower:
434
+ response = "📊 **Dataset Profile:**\n\n"
435
+ if current_profile:
436
+ response += f"**Shape:** {current_profile['shape']['rows']:,} rows × {current_profile['shape']['columns']} columns\n\n"
437
+ response += f"**Column Types:**\n"
438
+ response += f"- Numeric: {len(current_profile['column_types']['numeric'])} columns\n"
439
+ response += f"- Categorical: {len(current_profile['column_types']['categorical'])} columns\n"
440
+ response += f"- Datetime: {len(current_profile['column_types']['datetime'])} columns\n\n"
441
+ response += f"**Overall Stats:**\n"
442
+ response += f"- Total cells: {current_profile['overall_stats']['total_cells']:,}\n"
443
+ response += f"- Null values: {current_profile['overall_stats']['total_nulls']} ({current_profile['overall_stats']['null_percentage']:.1f}%)\n"
444
+ response += f"- Duplicates: {current_profile['overall_stats']['duplicate_rows']}\n"
445
+ else:
446
+ response += "Profile information is available at the top of the chat!"
447
+
448
+ elif 'quality' in user_msg_lower or 'issues' in user_msg_lower:
449
+ quality = detect_data_quality_issues(current_file)
450
+ response = "🔍 **Data Quality Report:**\n\n"
451
+
452
+ if quality['critical']:
453
+ response += f"🔴 **Critical Issues:** {len(quality['critical'])}\n"
454
+ for issue in quality['critical']:
455
+ response += f" • {issue['message']}\n"
456
+ response += "\n"
457
+
458
+ if quality['warning']:
459
+ response += f"🟡 **Warnings:** {len(quality['warning'])}\n"
460
+ for issue in quality['warning'][:5]: # Show first 5
461
+ response += f" • {issue['message']}\n"
462
+ if len(quality['warning']) > 5:
463
+ response += f" • ... and {len(quality['warning']) - 5} more\n"
464
+ response += "\n"
465
+
466
+ if quality['info']:
467
+ response += f"🔵 **Info:** {len(quality['info'])} observations\n"
468
+
469
+ if not quality['critical'] and not quality['warning'] and not quality['info']:
470
+ response += "✅ No issues detected! Your data looks good.\n"
471
+
472
+ elif 'columns' in user_msg_lower or 'column' in user_msg_lower:
473
+ if current_profile:
474
+ response = "📋 **Dataset Columns:**\n\n"
475
+ for col, info in current_profile['columns'].items():
476
+ nulls = info.get('null_count', 0)
477
+ null_pct = (nulls / current_profile['shape']['rows'] * 100) if current_profile['shape']['rows'] > 0 else 0
478
+ response += f"• **{col}** ({info['type']})\n"
479
+ response += f" - Nulls: {nulls} ({null_pct:.1f}%)\n"
480
+ if 'unique' in info:
481
+ response += f" - Unique: {info['unique']}\n"
482
+ else:
483
+ response = "📋 **Columns:** Please upload a file first to see column information."
484
+
485
+ elif 'help' in user_msg_lower:
486
+ response = "💡 **Available Commands:**\n\n"
487
+ response += "**Manual Commands:**\n"
488
+ response += "• `profile` - Show detailed dataset statistics\n"
489
+ response += "• `quality` - Check data quality issues\n"
490
+ response += "• `columns` - List all columns with details\n"
491
+ response += "• `help` - Show this help message\n\n"
492
+ response += "**Quick Actions:**\n"
493
+ response += "• Use the **Quick Train** panel on the right to train models\n"
494
+ response += "• Check **Dataset Info** in the sidebar for quick stats\n"
495
+
496
+ else:
497
+ # Default response for unrecognized commands
498
+ response = f"💬 **You said:** {user_message}\n\n"
499
+ response += "⚠️ AI agent is not available. I can respond to these commands:\n\n"
500
+ response += "• `profile` - Show detailed statistics\n"
501
+ response += "• `quality` - Check data quality\n"
502
+ response += "• `columns` - List all columns\n"
503
+ response += "• `help` - Show available commands\n\n"
504
+ response += "**Or use Quick Train** on the right to train models directly!\n"
505
+
506
+ # Add user message and assistant response
507
+ history = add_user_message(history, user_message)
508
+ history = add_assistant_message(history, response)
509
+ yield history, "", [], []
510
+ return
511
+
512
+ # If no file is uploaded yet
513
+ if user_message and user_message.strip() and not current_file:
514
+ response = "⚠️ **Please upload a dataset first!**\n\n"
515
+ response += "Click the 'Upload Dataset' button above and select a CSV or Parquet file."
516
+ # Add user message and assistant response
517
+ history = add_user_message(history, user_message)
518
+ history = add_assistant_message(history, response)
519
+ yield history, "", [], []
520
+ return
521
+
522
+ except Exception as e:
523
+ error_msg = f"❌ **Error:** {str(e)}\n\n"
524
+ error_msg += "**Traceback:**\n```\n" + traceback.format_exc() + "\n```"
525
+ if user_message:
526
+ # Check if we already added the user message
527
+ last_user = get_last_user_content(history)
528
+ if last_user != user_message:
529
+ history = add_user_message(history, user_message)
530
+ history = add_assistant_message(history, error_msg)
531
+ else:
532
+ history = add_assistant_message(history, error_msg)
533
+ yield history, "", [], []
534
+ return
535
+
536
+ # Default return if nothing matched
537
+ yield history, "", [], []
538
+
539
+
540
+ def quick_profile(file):
541
+ """Quick profile display in the sidebar."""
542
+ if file is None:
543
+ return "No file uploaded yet."
544
+
545
+ try:
546
+ profile = profile_dataset(file.name)
547
+
548
+ info = f"**{Path(file.name).name}**\n\n"
549
+ info += f"📊 {profile['shape']['rows']:,} rows × {profile['shape']['columns']} cols\n\n"
550
+ info += f"**Columns:**\n"
551
+ for col, col_info in list(profile['columns'].items())[:10]:
552
+ info += f"- {col} ({col_info['type']})\n"
553
+
554
+ if len(profile['columns']) > 10:
555
+ info += f"- ... and {len(profile['columns']) - 10} more\n"
556
+
557
+ return info
558
+ except Exception as e:
559
+ return f"Error: {str(e)}"
560
+
561
+
562
+ def train_model_ui(file, target_col, model_type, test_size, progress=gr.Progress()):
563
+ """Train a model directly from the UI."""
564
+ if file is None:
565
+ return "⚠️ Please upload a dataset first!"
566
+
567
+ if not target_col:
568
+ return "⚠️ Please specify a target column!"
569
+
570
+ # Clean up the target column name - remove surrounding quotes if present
571
+ target_col = target_col.strip().strip("'").strip('"')
572
+
573
+ try:
574
+ # Show progress
575
+ progress(0, desc="🔄 Loading dataset...")
576
+ yield "⏳ **Training in progress...**\n\n📊 Loading dataset..."
577
+
578
+ import time
579
+ time.sleep(0.5) # Brief pause for UI feedback
580
+
581
+ progress(0.2, desc="🔄 Preparing data...")
582
+ yield "⏳ **Training in progress...**\n\n📊 Dataset loaded\n🔄 Preparing data..."
583
+
584
+ time.sleep(0.3)
585
+ # Determine problem type
586
+ problem_type = "classification" if model_type == "Classification" else "regression"
587
+
588
+ progress(0.4, desc="🤖 Training models...")
589
+ yield "⏳ **Training in progress...**\n\n📊 Dataset loaded\n✅ Data prepared\n🤖 Training multiple models..."
590
+
591
+ # Train baseline models
592
+ result = train_baseline_models(
593
+ file.name,
594
+ target_col=target_col,
595
+ task_type=problem_type,
596
+ test_size=test_size
597
+ )
598
+
599
+ progress(0.9, desc="📊 Evaluating results...")
600
+
601
+ # Check if training was successful
602
+ if result.get('status') == 'error':
603
+ yield f"❌ **Training Failed**\n\n{result.get('message', 'Unknown error')}"
604
+ return
605
+
606
+ if 'best_model' not in result:
607
+ yield f"❌ **Training Failed**\n\nNo models were successfully trained. Result: {result}"
608
+ return
609
+
610
+ # Get the best model
611
+ best_model_name = result['best_model']['name']
612
+ if not best_model_name:
613
+ yield f"❌ **Training Failed**\n\nNo model could be selected as best model."
614
+ return
615
+
616
+ best_model_info = result['models'][best_model_name]
617
+ best_metrics = best_model_info.get('test_metrics', {})
618
+
619
+ output = f"✅ **Model Training Complete!**\n\n"
620
+ output += f"## 🏆 Best Model: **{best_model_name}**\n\n"
621
+
622
+ output += f"**Dataset Info:**\n"
623
+ output += f"- Features: {result.get('n_features', 0)}\n"
624
+ output += f"- Training samples: {result.get('train_size', 0):,}\n"
625
+ output += f"- Test samples: {result.get('test_size', 0):,}\n\n"
626
+
627
+ if problem_type == "classification":
628
+ output += f"**Test Metrics:**\n"
629
+ output += f"- ✅ Accuracy: {best_metrics.get('accuracy', 0):.4f}\n"
630
+ output += f"- 🎯 Precision: {best_metrics.get('precision', 0):.4f}\n"
631
+ output += f"- 📊 Recall: {best_metrics.get('recall', 0):.4f}\n"
632
+ output += f"- 🔥 F1 Score: {best_metrics.get('f1', 0):.4f}\n\n"
633
+ else:
634
+ output += f"**Test Metrics:**\n"
635
+ output += f"- 📈 R² Score: {best_metrics.get('r2', 0):.4f}\n"
636
+ output += f"- 📉 RMSE: {best_metrics.get('rmse', 0):.2f}\n"
637
+ output += f"- 📊 MAE: {best_metrics.get('mae', 0):.2f}\n"
638
+ output += f"- 💯 MAPE: {best_metrics.get('mape', 0):.2f}%\n\n"
639
+
640
+ output += f"## 📊 All Models Comparison:\n\n"
641
+ for model_name, model_info in result['models'].items():
642
+ if 'test_metrics' in model_info:
643
+ test_metrics = model_info['test_metrics']
644
+ indicator = "🏆 " if model_name == best_model_name else " "
645
+ if problem_type == "classification":
646
+ f1 = test_metrics.get('f1', 0)
647
+ acc = test_metrics.get('accuracy', 0)
648
+ output += f"{indicator}**{model_name}:**\n"
649
+ output += f" - F1: {f1:.4f} | Accuracy: {acc:.4f}\n"
650
+ else:
651
+ r2 = test_metrics.get('r2', 0)
652
+ rmse = test_metrics.get('rmse', 0)
653
+ output += f"{indicator}**{model_name}:**\n"
654
+ output += f" - R²: {r2:.4f} | RMSE: {rmse:.2f}\n"
655
+ elif 'status' in model_info and model_info['status'] == 'error':
656
+ output += f" ❌ **{model_name}:** {model_info.get('message', 'Error')}\n"
657
+
658
+ # Display generated plots if available
659
+ plots_to_show = []
660
+
661
+ # Check for performance plots
662
+ if 'performance_plots' in result and result['performance_plots']:
663
+ if isinstance(result['performance_plots'], list):
664
+ plots_to_show.extend(result['performance_plots'])
665
+ else:
666
+ plots_to_show.append(result['performance_plots'])
667
+
668
+ # Check for feature importance plot
669
+ if 'feature_importance_plot' in result and result['feature_importance_plot']:
670
+ plots_to_show.append(result['feature_importance_plot'])
671
+
672
+ # Embed plots
673
+ if plots_to_show:
674
+ output += f"\n\n📊 **Visualizations:**\n\n"
675
+ for plot_path in plots_to_show:
676
+ if isinstance(plot_path, str) and plot_path.endswith('.html') and os.path.exists(plot_path):
677
+ try:
678
+ with open(plot_path, 'r', encoding='utf-8') as f:
679
+ plot_html = f.read()
680
+ # Add plot title based on filename
681
+ plot_name = Path(plot_path).stem.replace('_', ' ').title()
682
+ output += f"**{plot_name}:**\n"
683
+ output += f'<iframe srcdoc="{plot_html.replace("&", "&amp;").replace(chr(34), "&quot;")}" width="100%" height="500" frameborder="0"></iframe>\n\n'
684
+ except Exception as e:
685
+ # Fallback to file path
686
+ output += f"📁 {Path(plot_path).name}: `{plot_path}`\n"
687
+
688
+ progress(1.0, desc="✅ Complete!")
689
+ yield output
690
+
691
+ except Exception as e:
692
+ yield f"❌ **Error:** {str(e)}\n\n```\n{traceback.format_exc()}\n```"
693
+
694
+
695
+ def clear_conversation():
696
+ """Clear the conversation and reset state."""
697
+ global current_file, current_profile, last_agent_response
698
+ current_file = None
699
+ current_profile = None
+ last_agent_response = None
700
+ return [], None, "", [], ""
701
+
702
+
703
+ def format_html_reports(html_paths):
704
+ """Format HTML reports/plots for display in HTML component."""
705
+ if not html_paths or len(html_paths) == 0:
706
+ return "<div style='text-align:center; padding:40px; color:#666;'>No reports generated yet. Try: 'Generate a quality report' or 'Create interactive visualizations'</div>"
707
+
708
+ html_output = """
709
+ <style>
710
+ .report-container {
711
+ padding: 20px;
712
+ background: #f8f9fa;
713
+ }
714
+ .report-card {
715
+ margin-bottom: 30px;
716
+ border: 2px solid #dee2e6;
717
+ border-radius: 12px;
718
+ overflow: hidden;
719
+ background: white;
720
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1);
721
+ }
722
+ .report-header {
723
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
724
+ color: white;
725
+ padding: 15px 20px;
726
+ font-weight: bold;
727
+ font-size: 18px;
728
+ display: flex;
729
+ justify-content: space-between;
730
+ align-items: center;
731
+ }
732
+ .report-meta {
733
+ font-size: 12px;
734
+ opacity: 0.9;
735
+ }
736
+ .report-iframe {
737
+ width: 100%;
738
+ min-height: 600px;
739
+ border: none;
740
+ background: white;
741
+ }
742
+ .report-footer {
743
+ background: #f8f9fa;
744
+ padding: 10px 20px;
745
+ font-size: 12px;
746
+ color: #666;
747
+ border-top: 1px solid #dee2e6;
748
+ }
749
+ </style>
750
+ <div class="report-container">
751
+ """
752
+
753
+ html_output += f"<h2 style='color: #667eea; margin-bottom: 20px;'>📋 {len(html_paths)} Report(s) Generated</h2>"
754
+
755
+ for i, html_path in enumerate(html_paths, 1):
756
+ try:
757
+ # Get file metadata
758
+ file_name = Path(html_path).name
759
+ file_size = os.path.getsize(html_path) / 1024 # KB
760
+ report_title = Path(html_path).stem.replace('_', ' ').title()
761
+
762
+ # Read the HTML content
763
+ with open(html_path, 'r', encoding='utf-8') as f:
764
+ html_content = f.read()
765
+
766
+ # Escape the content for embedding in the srcdoc attribute: ampersands
+ # first, then double quotes. (Backslashes and single quotes must be left
+ # untouched, or scripts inside the embedded report get corrupted.)
+ escaped_content = html_content.replace('&', '&amp;').replace('"', '&quot;')
768
+
769
+ html_output += f"""
770
+ <div class="report-card">
771
+ <div class="report-header">
772
+ <span>📊 {i}. {report_title}</span>
773
+ <span class="report-meta">{file_size:.1f} KB</span>
774
+ </div>
775
+ <iframe class="report-iframe" srcdoc="{escaped_content}"></iframe>
776
+ <div class="report-footer">
777
+ 📁 {html_path}
778
+ </div>
779
+ </div>
780
+ """
781
+ except Exception as e:
782
+ html_output += f"""
783
+ <div class="report-card">
784
+ <div class="report-header" style="background: linear-gradient(135deg, #f44336 0%, #e91e63 100%);">
785
+ <span>❌ Error loading: {Path(html_path).name}</span>
786
+ </div>
787
+ <div style="padding: 20px;">
788
+ <p><strong>Error:</strong> {str(e)}</p>
789
+ <p><strong>Path:</strong> {html_path}</p>
790
+ </div>
791
+ </div>
792
+ """
793
+
794
+ html_output += "</div>"
795
+
796
+ return html_output
797
+
798
+
799
+ def extract_and_display_plots(agent_response):
800
+ """Extract plots from agent response and format them for display."""
801
+ plots_html = ""
802
+
803
+ if not agent_response or agent_response.get('status') != 'success':
804
+ return gr.update(value="<p style='text-align:center; color:#666;'>No visualizations generated yet. Upload a dataset and run analysis!</p>")
805
+
806
+ workflow_history = agent_response.get('workflow_history', [])
807
+ if not workflow_history:
808
+ return gr.update(value="<p style='text-align:center; color:#666;'>No visualizations in this workflow.</p>")
809
+
810
+ # Find all plots
811
+ plots_paths = []
812
+
813
+ def find_plots(obj, plots_list):
814
+ if isinstance(obj, dict):
815
+ # Check direct plot keys
816
+ for key in ['plot_path', 'plot_file', 'html_path', 'output_path',
817
+ 'plots', 'plot_paths', 'performance_plots', 'feature_importance_plot']:
818
+ if key in obj and obj[key]:
819
+ if isinstance(obj[key], list):
820
+ for plot_path in obj[key]:
821
+ if isinstance(plot_path, str) and plot_path.endswith('.html') and os.path.exists(plot_path):
822
+ plots_list.append(plot_path)
823
+ elif isinstance(obj[key], str) and obj[key].endswith('.html') and os.path.exists(obj[key]):
824
+ plots_list.append(obj[key])
825
+ # Recursively search nested dicts
826
+ for value in obj.values():
827
+ find_plots(value, plots_list)
828
+
829
+ for step in workflow_history:
830
+ result = step.get('result', {})
831
+ find_plots(result, plots_paths)
832
+
833
+ # Remove duplicates while preserving order
834
+ plots_paths = list(dict.fromkeys(plots_paths))
835
+
836
+ if not plots_paths:
837
+ return gr.update(value="<p style='text-align:center; color:#666;'>No plots were generated in this analysis.</p>")
838
+
839
+ # Build HTML gallery
840
+ plots_html = f"""
841
+ <div style='padding: 20px;'>
842
+ <h2 style='color: #1f77b4; margin-bottom: 20px;'>📊 Visualization Gallery ({len(plots_paths)} plots)</h2>
843
+ """
844
+
845
+ for i, plot_path in enumerate(plots_paths, 1):
846
+ try:
847
+ with open(plot_path, 'r', encoding='utf-8') as f:
848
+ plot_content = f.read()
849
+
850
+ plot_name = Path(plot_path).stem.replace('_', ' ').title()
851
+
852
+ plots_html += f"""
853
+ <div style='margin-bottom: 30px; border: 1px solid #ddd; border-radius: 8px; overflow: hidden;'>
854
+ <div style='background: linear-gradient(90deg, #1f77b4, #2ca02c); color: white; padding: 10px 15px; font-weight: bold;'>
855
+ {i}. {plot_name}
856
+ </div>
857
+ <div style='padding: 10px; background: white;'>
858
+ <iframe srcdoc='{plot_content.replace("'", "&apos;").replace('"', "&quot;")}'
859
+ width='100%' height='500' frameborder='0'
860
+ style='border: none; border-radius: 5px;'></iframe>
861
+ </div>
862
+ <div style='background: #f8f9fa; padding: 8px 15px; font-size: 12px; color: #666;'>
863
+ 📁 {plot_path}
864
+ </div>
865
+ </div>
866
+ """
867
+ except Exception as e:
868
+ plots_html += f"""
869
+ <div style='margin-bottom: 20px; padding: 15px; border: 1px solid #f44336; border-radius: 5px; background: #ffebee;'>
870
+ <strong>❌ Failed to load: {Path(plot_path).name}</strong><br>
871
+ <small>{str(e)}</small>
872
+ </div>
873
+ """
874
+
875
+ plots_html += "</div>"
876
+
877
+ return gr.update(value=plots_html)
878
+
879
+
880
+ # Custom CSS for better visual feedback
881
+ custom_css = """
882
+ .status-box {
883
+ padding: 10px;
884
+ border-radius: 5px;
885
+ background: linear-gradient(90deg, #e8f5e9 0%, #c8e6c9 100%);
886
+ margin-bottom: 10px;
887
+ text-align: center;
888
+ font-weight: bold;
889
+ }
890
+ """
891
+
892
+ # Create the Gradio interface
893
+ with gr.Blocks(title="AI Agent Data Scientist", theme=gr.themes.Soft(), css=custom_css) as demo:
894
+ gr.Markdown("""
895
+ # 🤖 AI Agent Data Scientist
896
+
897
+ Upload your dataset and chat with the AI agent to perform data science tasks!
898
+
899
+ **Features:**
900
+ - 📊 Automatic dataset profiling
901
+ - 🤖 Natural language queries
902
+ - 🎯 Model training (classification & regression)
903
+ - 🔍 Data quality analysis
904
+ - 📈 Feature engineering
905
+ - 🎨 **NEW:** Automatic visualization generation!
906
+ - And 59 tools total!
907
+ """)
908
+
909
+ # Store agent response for visualization extraction
910
+ agent_response_state = gr.State(None)
911
+
912
+ with gr.Row():
913
+ # Left column - Main chat interface
914
+ with gr.Column(scale=2):
915
+ # Status indicator
916
+ status_box = gr.Markdown("🟢 **Ready** - Upload a dataset to begin", elem_classes=["status-box"])
917
+
918
+ chatbot = gr.Chatbot(
919
+ label="Chat with AI Agent",
920
+ height=450,
921
+ show_label=True,
922
+ avatar_images=(None, "🤖"),
923
+ sanitize_html=False # Allow HTML content including iframes
924
+ )
925
+
926
+ with gr.Row():
927
+ file_upload = gr.File(
928
+ label="📁 Upload Dataset(s) (CSV/Parquet) - Single or Multiple Files",
929
+ file_types=[".csv", ".parquet"],
930
+ file_count="multiple", # Allow multiple file uploads
931
+ type="filepath"
932
+ )
933
+
934
+ with gr.Row():
935
+ user_input = gr.Textbox(
936
+ label="Your Message",
937
+ placeholder="Ask anything: 'train a model', 'analyze my data', 'generate visualizations'",
938
+ lines=2,
939
+ scale=4
940
+ )
941
+ submit_btn = gr.Button("📤 Send", variant="primary", scale=1)
942
+
943
+ with gr.Row():
944
+ clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+ # Right column - Quick actions and info
945
+ with gr.Column(scale=1):
946
+ gr.Markdown("## 📊 Dataset Info")
947
+ dataset_info = gr.Markdown("Upload a dataset to see information here.")
948
+
949
+ gr.Markdown("## 🎯 Quick Train")
950
+ with gr.Group():
951
+ target_column = gr.Textbox(
952
+ label="Target Column",
953
+ placeholder="e.g., 'price', 'class', 'label'"
954
+ )
955
+ model_type_choice = gr.Radio(
956
+ ["Classification", "Regression"],
957
+ label="Model Type",
958
+ value="Classification"
959
+ )
960
+ test_size_slider = gr.Slider(
961
+ 0.1, 0.5, 0.3,
962
+ label="Test Size",
963
+ step=0.05
964
+ )
965
+ train_btn = gr.Button("🚀 Train Model", variant="primary")
966
+
967
+ training_output = gr.Markdown("Training results will appear here.")
968
+
969
+ gr.Markdown("""
970
+ ## 💡 Example Queries
971
+
972
+ - "Train a classification model to predict [target]"
973
+ - "Show me statistics for [column]"
974
+ - "Detect outliers in the dataset"
975
+ - "What are the most important features?"
976
+ - "Generate a quality report"
977
+ - "Create polynomial features"
978
+ - "Balance the dataset using SMOTE"
979
+ """)
980
+
981
+ # Visualization Gallery Section (Full Width)
982
+ with gr.Row():
983
+ with gr.Column():
984
+ gr.Markdown("## 🎨 Visualization Gallery")
985
+ visualization_gallery = gr.Gallery(
986
+ label="Generated Plots (PNG/JPG)",
987
+ show_label=True,
988
+ elem_id="gallery",
989
+ columns=2,
990
+ height=400
991
+ )
992
+
993
+ # Reports Viewer Section (Full Width)
994
+ with gr.Row():
995
+ with gr.Column():
996
+ gr.Markdown("## 📋 Reports & Interactive Visualizations")
997
+ gr.Markdown("*HTML reports and interactive Plotly charts will be displayed here*")
998
+ reports_viewer = gr.HTML(
999
+ value="<div style='text-align:center; padding:40px; color:#666;'>No reports generated yet. Try: 'Generate a quality report' or 'Create interactive visualizations'</div>",
1000
+ elem_id="reports_viewer"
1001
+ )
1002
+
1003
+ # Create state to hold HTML report paths
1004
+ html_reports_state = gr.State([])
1005
+
1006
+ # Event handlers with streaming support
1007
+ submit_result = submit_btn.click(
1008
+ fn=analyze_dataset,
1009
+ inputs=[file_upload, user_input, chatbot],
1010
+ outputs=[chatbot, user_input, visualization_gallery, html_reports_state],
1011
+ show_progress="full" # Show progress bar
1012
+ )
1013
+ submit_result.then(
1014
+ fn=format_html_reports,
1015
+ inputs=[html_reports_state],
1016
+ outputs=[reports_viewer]
1017
+ )
1018
+
1019
+ user_input_result = user_input.submit(
1020
+ fn=analyze_dataset,
1021
+ inputs=[file_upload, user_input, chatbot],
1022
+ outputs=[chatbot, user_input, visualization_gallery, html_reports_state],
1023
+ show_progress="full"
1024
+ )
1025
+ user_input_result.then(
1026
+ fn=format_html_reports,
1027
+ inputs=[html_reports_state],
1028
+ outputs=[reports_viewer]
1029
+ )
1030
+
1031
+ file_result = file_upload.change(
1032
+ fn=analyze_dataset,
1033
+ inputs=[file_upload, gr.Textbox(value="", visible=False), chatbot],
1034
+ outputs=[chatbot, user_input, visualization_gallery, html_reports_state],
1035
+ show_progress="full"
1036
+ )
1037
+ file_result.then(
1038
+ fn=quick_profile,
1039
+ inputs=[file_upload],
1040
+ outputs=[dataset_info]
1041
+ )
1042
+ file_result.then(
1043
+ fn=format_html_reports,
1044
+ inputs=[html_reports_state],
1045
+ outputs=[reports_viewer]
1046
+ )
1047
+
1048
+ train_btn.click(
1049
+ fn=train_model_ui,
1050
+ inputs=[file_upload, target_column, model_type_choice, test_size_slider],
1051
+ outputs=[training_output],
1052
+ show_progress="full" # Show progress bar
1053
+ )
1054
+
1055
+ clear_btn.click(
1056
+ clear_conversation,
1057
+ outputs=[chatbot, file_upload, user_input, visualization_gallery, reports_viewer]
1058
+ )
1059
+
1060
+ if __name__ == "__main__":
1061
+ print("=" * 70)
1062
+ print("🚀 Starting AI Agent Data Scientist Chat UI...")
1063
+ print("=" * 70)
1064
+ print("\n🌐 The UI will open in your browser automatically.")
1065
+ print("💡 If it doesn't, copy the URL shown below.\n")
1066
+
1067
+ demo.launch(
1068
+ share=False, # Set to True to create a public link
1069
+ server_name="0.0.0.0", # Listen on all interfaces
1070
+ server_port=7865, # Changed port to avoid conflict
1071
+ show_error=True,
1072
+ inbrowser=True # Auto-open browser
1073
+ )
cloudbuild.yaml ADDED
@@ -0,0 +1,69 @@
1
+ # Google Cloud Build configuration for automated deployments
2
+ # Triggered on git push to main branch
3
+
4
+ steps:
5
+ # Step 1: Build the container image
6
+ - name: 'gcr.io/cloud-builders/docker'
7
+ args:
8
+ - 'build'
9
+ - '-t'
10
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
11
+ - '-t'
12
+ - 'gcr.io/$PROJECT_ID/data-science-agent:latest'
13
+ - '.'
14
+ timeout: 600s
15
+
16
+ # Step 2: Push the container image to Container Registry
17
+ - name: 'gcr.io/cloud-builders/docker'
18
+ args:
19
+ - 'push'
20
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
21
+
22
+ - name: 'gcr.io/cloud-builders/docker'
23
+ args:
24
+ - 'push'
25
+ - 'gcr.io/$PROJECT_ID/data-science-agent:latest'
26
+
27
+ # Step 3: Deploy to Cloud Run
28
+ - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
29
+ entrypoint: gcloud
30
+ args:
31
+ - 'run'
32
+ - 'deploy'
33
+ - 'data-science-agent'
34
+ - '--image'
35
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
36
+ - '--region'
37
+ - 'us-central1'
38
+ - '--platform'
39
+ - 'managed'
40
+ - '--allow-unauthenticated'
41
+ - '--memory'
42
+ - '4Gi'
43
+ - '--cpu'
44
+ - '2'
45
+ - '--timeout'
46
+ - '900'
47
+ - '--max-instances'
48
+ - '10'
49
+ - '--min-instances'
50
+ - '0'
51
+ - '--concurrency'
52
+ - '10'
53
+ - '--set-env-vars'
54
+ - 'LLM_PROVIDER=groq,REASONING_EFFORT=medium,CACHE_TTL_SECONDS=86400'
55
+ - '--set-secrets'
56
+ - 'GROQ_API_KEY=GROQ_API_KEY:latest,GOOGLE_API_KEY=GOOGLE_API_KEY:latest,GOOGLE_APPLICATION_CREDENTIALS=GOOGLE_APPLICATION_CREDENTIALS:latest'
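+ # NOTE: GOOGLE_APPLICATION_CREDENTIALS must be a *file path*. If the secret
+ # holds the service-account JSON itself, mount it as a volume instead, e.g.
+ # '/secrets/sa/key.json=GOOGLE_APPLICATION_CREDENTIALS:latest', and point
+ # the env var at that path.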
57
+
58
+ # Build timeout
59
+ timeout: 1200s
60
+
61
+ # Images to push to Container Registry
62
+ images:
63
+ - 'gcr.io/$PROJECT_ID/data-science-agent:$COMMIT_SHA'
64
+ - 'gcr.io/$PROJECT_ID/data-science-agent:latest'
65
+
66
+ # Build options
67
+ options:
68
+ machineType: 'N1_HIGHCPU_8'
69
+ logging: CLOUD_LOGGING_ONLY
data/.gitkeep ADDED
File without changes
deploy.sh ADDED
@@ -0,0 +1,171 @@
1
+ #!/bin/bash
2
+ # Manual deployment script for Google Cloud Run
3
+ # Use this for one-off deployments or CI/CD pipeline integration
4
+
5
+ set -e # Exit on error
6
+
7
+ # Colors for output
8
+ RED='\033[0;31m'
9
+ GREEN='\033[0;32m'
10
+ YELLOW='\033[1;33m'
11
+ NC='\033[0m' # No Color
12
+
13
+ echo -e "${GREEN}🚀 Data Science Agent - Cloud Run Deployment${NC}"
14
+ echo "=================================================="
15
+
16
+ # Check if gcloud is installed
17
+ if ! command -v gcloud &> /dev/null; then
18
+ echo -e "${RED}❌ Error: gcloud CLI not found. Install it from: https://cloud.google.com/sdk/install${NC}"
19
+ exit 1
20
+ fi
21
+
22
+ # Get GCP Project ID
23
+ if [ -z "$GCP_PROJECT_ID" ]; then
24
+ echo -e "${YELLOW}⚠️ GCP_PROJECT_ID not set. Using gcloud default project...${NC}"
25
+ GCP_PROJECT_ID=$(gcloud config get-value project 2>/dev/null)
26
+
27
+ if [ -z "$GCP_PROJECT_ID" ]; then
28
+ echo -e "${RED}❌ Error: No GCP project configured. Run: gcloud config set project YOUR_PROJECT_ID${NC}"
29
+ exit 1
30
+ fi
31
+ fi
32
+
33
+ echo -e "${GREEN}📋 Project ID: ${GCP_PROJECT_ID}${NC}"
34
+
35
+ # Configuration
36
+ SERVICE_NAME="data-science-agent"
37
+ REGION="${CLOUD_RUN_REGION:-us-central1}"
38
+ IMAGE_NAME="gcr.io/${GCP_PROJECT_ID}/${SERVICE_NAME}"
39
+ MEMORY="${MEMORY:-4Gi}"
40
+ CPU="${CPU:-2}"
41
+ MAX_INSTANCES="${MAX_INSTANCES:-10}"
42
+ TIMEOUT="${TIMEOUT:-900}"
43
+
44
+ echo "Region: ${REGION}"
45
+ echo "Image: ${IMAGE_NAME}:latest"
46
+ echo "Memory: ${MEMORY}"
47
+ echo "CPU: ${CPU}"
48
+ echo ""
49
+
50
+ # Step 1: Enable required APIs
51
+ echo -e "${YELLOW}🔧 Step 1/5: Enabling required Google Cloud APIs...${NC}"
52
+ gcloud services enable \
53
+ cloudbuild.googleapis.com \
54
+ run.googleapis.com \
55
+ containerregistry.googleapis.com \
56
+ secretmanager.googleapis.com \
57
+ --project=${GCP_PROJECT_ID} \
58
+ --quiet
59
+
60
+ echo -e "${GREEN}✅ APIs enabled${NC}"
61
+ echo ""
62
+
63
+ # Step 2: Create secrets (if not exist)
64
+ echo -e "${YELLOW}🔐 Step 2/5: Checking secrets...${NC}"
65
+
66
+ create_secret_if_not_exists() {
67
+ local secret_name=$1
68
+ local secret_value=$2
69
+
70
+ if gcloud secrets describe ${secret_name} --project=${GCP_PROJECT_ID} &>/dev/null; then
71
+ echo " ℹ️ Secret ${secret_name} already exists"
72
+ else
73
+ if [ -n "${secret_value}" ]; then
74
+ echo " ➕ Creating secret: ${secret_name}"
75
+ echo -n "${secret_value}" | gcloud secrets create ${secret_name} \
76
+ --data-file=- \
77
+ --project=${GCP_PROJECT_ID} \
78
+ --quiet
79
+ else
80
+ echo -e " ${YELLOW}⚠️ ${secret_name} not provided. You'll need to create it manually:${NC}"
81
+ echo " gcloud secrets create ${secret_name} --data-file=- --project=${GCP_PROJECT_ID}"
82
+ fi
83
+ fi
84
+ }
85
+
86
+ create_secret_if_not_exists "GROQ_API_KEY" "${GROQ_API_KEY}"
87
+ create_secret_if_not_exists "GOOGLE_API_KEY" "${GOOGLE_API_KEY}"
88
+
89
+ echo -e "${GREEN}✅ Secrets checked${NC}"
90
+ echo ""
91
+
92
+ # Step 3: Build container image
93
+ echo -e "${YELLOW}🏗️ Step 3/5: Building container image...${NC}"
94
+ gcloud builds submit \
95
+ --tag ${IMAGE_NAME}:latest \
96
+ --project=${GCP_PROJECT_ID} \
97
+ --timeout=600s \
98
+ .
99
+
100
+ echo -e "${GREEN}✅ Container built: ${IMAGE_NAME}:latest${NC}"
101
+ echo ""
102
+
103
+ # Step 4: Deploy to Cloud Run
104
+ echo -e "${YELLOW}🚀 Step 4/5: Deploying to Cloud Run...${NC}"
105
+
106
+ # Build the gcloud command
107
+ DEPLOY_CMD="gcloud run deploy ${SERVICE_NAME} \
108
+ --image ${IMAGE_NAME}:latest \
109
+ --platform managed \
110
+ --region ${REGION} \
111
+ --allow-unauthenticated \
112
+ --memory ${MEMORY} \
113
+ --cpu ${CPU} \
114
+ --timeout ${TIMEOUT} \
115
+ --max-instances ${MAX_INSTANCES} \
116
+ --min-instances 0 \
117
+ --concurrency 10 \
118
+ --set-env-vars LLM_PROVIDER=groq,REASONING_EFFORT=medium,CACHE_TTL_SECONDS=86400,ARTIFACT_BACKEND=local \
119
+ --project ${GCP_PROJECT_ID}"
120
+
121
+ # Add secrets if they exist
122
+ if gcloud secrets describe GROQ_API_KEY --project=${GCP_PROJECT_ID} &>/dev/null; then
123
+ DEPLOY_CMD="${DEPLOY_CMD} --set-secrets GROQ_API_KEY=GROQ_API_KEY:latest"
124
+ fi
125
+
126
+ if gcloud secrets describe GOOGLE_API_KEY --project=${GCP_PROJECT_ID} &>/dev/null; then
127
+ DEPLOY_CMD="${DEPLOY_CMD} --set-secrets GOOGLE_API_KEY=GOOGLE_API_KEY:latest"
128
+ fi
129
+
130
+ # Execute deployment
131
+ eval ${DEPLOY_CMD}
132
+
133
+ echo -e "${GREEN}✅ Deployment complete${NC}"
134
+ echo ""
135
+
136
+ # Step 5: Get service URL
137
+ echo -e "${YELLOW}🌐 Step 5/5: Retrieving service URL...${NC}"
138
+ SERVICE_URL=$(gcloud run services describe ${SERVICE_NAME} \
139
+ --region ${REGION} \
140
+ --project ${GCP_PROJECT_ID} \
141
+ --format 'value(status.url)')
142
+
143
+ echo ""
144
+ echo -e "${GREEN}========================================${NC}"
145
+ echo -e "${GREEN}✅ DEPLOYMENT SUCCESSFUL!${NC}"
146
+ echo -e "${GREEN}========================================${NC}"
147
+ echo ""
148
+ echo -e "🌐 Service URL: ${GREEN}${SERVICE_URL}${NC}"
149
+ echo ""
150
+ echo "📝 Test endpoints:"
151
+ echo " Health check:"
152
+ echo " curl ${SERVICE_URL}/health"
153
+ echo ""
154
+ echo " List tools:"
155
+ echo " curl ${SERVICE_URL}/tools"
156
+ echo ""
157
+ echo " Run analysis:"
158
+ echo " curl -X POST ${SERVICE_URL}/run \\"
159
+ echo " -F 'file=@data.csv' \\"
160
+ echo " -F 'task_description=Analyze this dataset and predict the target column'"
161
+ echo ""
162
+ echo -e "${YELLOW}📊 View logs:${NC}"
163
+ echo " gcloud run logs read ${SERVICE_NAME} --region ${REGION} --project ${GCP_PROJECT_ID} --limit 50"
164
+ echo ""
165
+ echo -e "${YELLOW}🔧 Manage service:${NC}"
166
+ echo " gcloud run services describe ${SERVICE_NAME} --region ${REGION} --project ${GCP_PROJECT_ID}"
167
+ echo ""
168
+
169
+ # Save service URL to file
170
+ echo "${SERVICE_URL}" > .cloud_run_url
171
+ echo -e "${GREEN}💾 Service URL saved to .cloud_run_url${NC}"
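
Since the script saves the service URL to `.cloud_run_url`, a quick post-deploy smoke test is easy to script. Below is a minimal, hypothetical sketch using only the Python standard library; the endpoint and expected response keys are taken from `src/api/app.py`, and the script itself is not part of this commit:

```python
# smoke_test.py - hypothetical post-deploy check, not part of this commit.
# Reads the URL that deploy.sh wrote to .cloud_run_url and calls /health.
import json
import urllib.request
from pathlib import Path

service_url = Path(".cloud_run_url").read_text().strip()

with urllib.request.urlopen(f"{service_url}/health", timeout=30) as resp:
    payload = json.load(resp)

# Expected keys per src/api/app.py: status, agent_ready, provider, tools_count
assert payload["status"] == "healthy", payload
print(f"Service healthy: provider={payload['provider']}, tools={payload['tools_count']}")
```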
examples/titanic_example.py ADDED
@@ -0,0 +1,166 @@
1
+ """
2
+ Titanic Example - Demonstrating the complete Data Science Copilot workflow
3
+ """
4
+
5
+ import sys
6
+ import os
7
+ from pathlib import Path
8
+
9
+ # Add src to path
10
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
11
+
12
+ from orchestrator import DataScienceCopilot
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+
16
+ console = Console()
17
+
18
+
19
+ def main():
20
+ """
21
+ Complete example using the Titanic dataset.
22
+
23
+ This demonstrates the full workflow:
24
+ 1. Dataset profiling
25
+ 2. Quality issue detection
26
+ 3. Data cleaning
27
+ 4. Feature engineering
28
+ 5. Model training
29
+ 6. Report generation
30
+ """
31
+
32
+ console.print(Panel.fit(
33
+ "🚢 Titanic Survival Prediction - Complete Workflow Example",
34
+ style="bold blue"
35
+ ))
36
+
37
+ # Setup
38
+ titanic_path = "./data/titanic.csv"
39
+
40
+ # Check if dataset exists
41
+ if not Path(titanic_path).exists():
42
+ console.print("\n[yellow]⚠ Titanic dataset not found at ./data/titanic.csv[/yellow]")
43
+ console.print("[yellow]Please download it from: https://www.kaggle.com/c/titanic/data[/yellow]")
44
+ console.print("[yellow]Or place your own CSV file in the data directory[/yellow]\n")
45
+
46
+ # Use a sample path instead
47
+ console.print("[blue]Using sample dataset path for demonstration...[/blue]\n")
48
+ titanic_path = "your_dataset.csv" # User should replace this
49
+
50
+ # Initialize copilot
51
+ console.print("\n[bold]Step 1: Initialize Data Science Copilot[/bold]")
52
+ try:
53
+ copilot = DataScienceCopilot(reasoning_effort="medium")
54
+ console.print("[green]✓ Copilot initialized successfully[/green]")
55
+ except Exception as e:
56
+ console.print(f"[red]✗ Error: {e}[/red]")
57
+ console.print("[yellow]Make sure to set GROQ_API_KEY in .env file[/yellow]")
58
+ return
59
+
60
+ # Define the task
61
+ task_description = """
62
+ Analyze the Titanic dataset and build a model to predict passenger survival.
63
+
64
+ Key objectives:
65
+ 1. Understand the data structure and identify quality issues
66
+ 2. Handle missing values appropriately
67
+ 3. Engineer relevant features from available data (e.g., family size, titles from names)
68
+ 4. Train and compare multiple baseline models
69
+ 5. Identify the most important features for prediction
70
+ 6. Provide recommendations for improvement
71
+
72
+ Target: Achieve competitive performance (aim for 50-70th percentile on Kaggle leaderboard)
73
+ """
74
+
75
+ target_column = "Survived"
76
+
77
+ console.print("\n[bold]Step 2: Run Complete Analysis Workflow[/bold]")
78
+ console.print(f"Dataset: {titanic_path}")
79
+ console.print(f"Target: {target_column}")
80
+ console.print(f"Task: Predict passenger survival\n")
81
+
82
+ # Run analysis
83
+ try:
84
+ result = copilot.analyze(
85
+ file_path=titanic_path,
86
+ task_description=task_description,
87
+ target_col=target_column,
88
+ use_cache=True,
89
+ max_iterations=15 # Allow more iterations for complex workflow
90
+ )
91
+
92
+ # Display results
93
+ if result["status"] == "success":
94
+ console.print("\n[green]✓ Analysis Complete![/green]\n")
95
+
96
+ # Display summary
97
+ console.print(Panel(
98
+ result["summary"],
99
+ title="📋 Final Analysis Summary",
100
+ border_style="green"
101
+ ))
102
+
103
+ # Display workflow steps
104
+ console.print("\n[bold]🔧 Workflow Steps Executed:[/bold]")
105
+ for i, step in enumerate(result["workflow_history"], 1):
106
+ tool = step["tool"]
107
+ success = step["result"].get("success", False)
108
+ icon = "✓" if success else "✗"
109
+ color = "green" if success else "red"
110
+ console.print(f"{i}. [{color}]{icon}[/{color}] {tool}")
111
+
112
+ # Display statistics
113
+ console.print(f"\n[bold]📊 Execution Statistics:[/bold]")
114
+ console.print(f" Total Iterations: {result['iterations']}")
115
+ console.print(f" API Calls Made: {result['api_calls']}")
116
+ console.print(f" Execution Time: {result['execution_time']}s")
117
+
118
+ # Check for trained models
119
+ console.print("\n[bold]🤖 Model Training Results:[/bold]")
120
+ for step in result["workflow_history"]:
121
+ if step["tool"] == "train_baseline_models":
122
+ if step["result"].get("success"):
123
+ models_result = step["result"]["result"]
124
+ best_model = models_result.get("best_model", {})
125
+ console.print(f" Best Model: {best_model.get('name')}")
126
+ console.print(f" Score: {best_model.get('score'):.4f}")
127
+ console.print(f" Model Path: {best_model.get('model_path')}")
128
+
129
+ # Save results
130
+ output_file = "./outputs/reports/titanic_analysis.json"
131
+ Path(output_file).parent.mkdir(parents=True, exist_ok=True)
132
+
133
+ import json
134
+ with open(output_file, "w") as f:
135
+ json.dump(result, f, indent=2, default=str)  # default=str guards figures and other non-JSON objects
136
+
137
+ console.print(f"\n[cyan]💾 Full results saved to: {output_file}[/cyan]")
138
+
139
+ # Next steps
140
+ console.print("\n[bold]🎯 Next Steps:[/bold]")
141
+ console.print(" 1. Review the generated models in ./outputs/models/")
142
+ console.print(" 2. Check data quality reports in ./outputs/reports/")
143
+ console.print(" 3. Examine cleaned datasets in ./outputs/data/")
144
+ console.print(" 4. Use the best model for predictions on new data")
145
+
146
+ elif result["status"] == "error":
147
+ console.print(f"\n[red]✗ Analysis failed: {result['error']}[/red]")
148
+ console.print(f"Error type: {result['error_type']}")
149
+
150
+ else:
151
+ console.print(f"\n[yellow]⚠ Analysis incomplete: {result.get('message')}[/yellow]")
152
+
153
+ except Exception as e:
154
+ console.print(f"\n[red]✗ Unexpected error: {e}[/red]")
155
+ import traceback
156
+ console.print(traceback.format_exc())
157
+
158
+ # Cache statistics
159
+ console.print("\n[bold]📦 Cache Statistics:[/bold]")
160
+ cache_stats = copilot.get_cache_stats()
161
+ console.print(f" Valid Entries: {cache_stats['valid_entries']}")
162
+ console.print(f" Cache Size: {cache_stats['size_mb']} MB")
163
+
164
+
165
+ if __name__ == "__main__":
166
+ main()
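
Because the example persists the full result to `./outputs/reports/titanic_analysis.json`, the workflow can be inspected after the fact. A small illustrative sketch that reloads the report and summarizes which tools ran (field names taken from the script above):

```python
# Reload the report written by titanic_example.py and summarize tool outcomes.
import json
from pathlib import Path

report = json.loads(Path("./outputs/reports/titanic_analysis.json").read_text())

for i, step in enumerate(report["workflow_history"], 1):
    ok = step["result"].get("success", False)
    print(f"{i}. {'OK  ' if ok else 'FAIL'} {step['tool']}")

print(f"Iterations: {report['iterations']}, API calls: {report['api_calls']}")
```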
requirements.txt ADDED
@@ -0,0 +1,98 @@
1
+ # Core Dependencies
2
+ groq==0.11.0
3
+ python-dotenv==1.0.0
4
+
5
+ # Data Processing
6
+ polars>=0.20.3
7
+ duckdb>=0.10.0
8
+ pyarrow>=14.0.1
9
+ pandas>=2.2.0 # Updated for Python 3.13 compatibility
10
+
11
+ # Machine Learning
12
+ scikit-learn>=1.4.0
13
+ xgboost>=2.0.3
14
+ lightgbm>=4.6.0
15
+ catboost>=1.2.8
16
+ optuna>=3.5.0
17
+
18
+ # Explainability
19
+ shap>=0.44.1
20
+
21
+ # Advanced ML Tools
22
+ imbalanced-learn>=0.12.0
23
+
24
+ # Statistical Analysis
25
+ scipy>=1.11.4
26
+ statsmodels>=0.14.1
27
+
28
+ # Visualization
29
+ matplotlib>=3.8.2
30
+ seaborn>=0.13.1
31
+ plotly>=5.18.0 # Interactive visualizations
32
+
33
+ # EDA Report Generation
34
+ sweetviz>=2.3.1 # Beautiful fast EDA reports
35
+ ydata-profiling>=4.17.0 # Updated for Python 3.13 compatibility
36
+
37
+ # User Interface
38
+ # gradio>=5.49.1 # Replaced with React frontend
39
+
40
+ # REST API (Cloud Run)
41
+ fastapi>=0.109.0
42
+ uvicorn>=0.25.0
43
+ python-multipart>=0.0.6 # For file uploads
44
+
45
+ # Text Processing
46
+ textblob>=0.17.1
47
+
48
+ # Time Series Forecasting
49
+ prophet>=1.1.5
50
+ holidays>=0.38
51
+
52
+ # MLOps & Explainability
53
+ lime==0.2.0.1
54
+ fairlearn==0.10.0
55
+
56
+ # NLP (Optional - Uncomment for advanced NLP tools)
57
+ # These are optional but recommended for full NLP capabilities
58
+ # spacy==3.7.2 # For named entity recognition (perform_named_entity_recognition)
59
+ # transformers==4.35.2 # For transformer-based sentiment & topic modeling
60
+ # sentence-transformers==2.2.2 # For semantic text similarity
61
+ # bertopic==0.16.0 # For advanced topic modeling
62
+
63
+ # Computer Vision (Optional - Uncomment for CV tools)
64
+ # These are optional but recommended for full CV capabilities
65
+ # torch==2.1.0 # For CNN-based image feature extraction
66
+ # torchvision==0.16.0 # For pre-trained models (ResNet, EfficientNet, VGG)
67
+ Pillow==10.1.0 # For basic image processing
68
+ # opencv-python==4.8.1 # For advanced image processing & color features
69
+
70
+ # Business Intelligence (Optional - Uncomment for advanced BI tools)
71
+ # These are optional but add specialized capabilities
72
+ # lifetimes==0.11.3 # For customer lifetime value modeling
73
+ # econml==0.15.0 # For advanced causal inference
74
+
75
+ # CLI & UI
76
+ typer==0.9.0
77
+ rich==13.7.0
78
+ tqdm==4.66.1
79
+
80
+ # Utilities
81
+ pydantic==2.5.3
82
+ joblib==1.3.2
83
+
84
+ # Google Cloud Integration
85
+ google-cloud-bigquery==3.14.1
86
+ google-cloud-storage==2.14.0 # For GCS artifact storage
87
+ google-auth==2.25.2
88
+ google-generativeai==0.3.2 # For Gemini LLM support
89
+
90
+ # Testing
91
+ pytest==7.4.3
92
+ pytest-mock==3.12.0
93
+ pytest-cov==4.1.0
94
+
95
+ # Development
96
+ black==23.12.1
97
+ flake8==7.0.0
98
+ mypy==1.8.0
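
Since several NLP/CV/BI extras above are commented out, it can help to check at runtime which ones are actually importable before enabling the corresponding tools. A minimal sketch (import names inferred from the packages listed; note `opencv-python` imports as `cv2`):

```python
# Report which optional extras from requirements.txt are importable.
from importlib.util import find_spec

OPTIONAL_EXTRAS = [
    "spacy", "transformers", "sentence_transformers", "bertopic",  # NLP
    "torch", "torchvision", "cv2",                                 # CV
    "lifetimes", "econml",                                         # BI
]

for name in OPTIONAL_EXTRAS:
    status = "installed" if find_spec(name) is not None else "missing"
    print(f"{name:25s} {status}")
```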
setup-deployment.sh ADDED
@@ -0,0 +1,78 @@
1
+ #!/bin/bash
2
+ # Quick setup script for macOS deployment prerequisites
3
+
4
+ set -e
5
+
6
+ RED='\033[0;31m'
7
+ GREEN='\033[0;32m'
8
+ YELLOW='\033[1;33m'
9
+ BLUE='\033[0;34m'
10
+ NC='\033[0m'
11
+
12
+ echo -e "${BLUE}🔧 Data Science Agent - Deployment Setup${NC}"
13
+ echo "=========================================="
14
+ echo ""
15
+
16
+ # Check if Homebrew is installed
17
+ if ! command -v brew &> /dev/null; then
18
+ echo -e "${RED}❌ Homebrew not found${NC}"
19
+ echo "Installing Homebrew..."
20
+ /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
21
+ else
22
+ echo -e "${GREEN}✅ Homebrew installed${NC}"
23
+ fi
24
+
25
+ # Install Docker Desktop
26
+ if ! command -v docker &> /dev/null; then
27
+ echo -e "${YELLOW}📦 Installing Docker Desktop...${NC}"
28
+ brew install --cask docker
29
+ echo -e "${GREEN}✅ Docker Desktop installed${NC}"
30
+ echo -e "${YELLOW}⚠️ Please start Docker Desktop application, then run this script again${NC}"
31
+ exit 0
32
+ else
33
+ echo -e "${GREEN}✅ Docker installed${NC}"
34
+ fi
35
+
36
+ # Check if Docker daemon is running
37
+ if ! docker info &> /dev/null; then
38
+ echo -e "${YELLOW}⚠️ Docker is installed but not running${NC}"
39
+ echo "Please start Docker Desktop application, then run this script again"
40
+ exit 0
41
+ fi
42
+
43
+ # Install Google Cloud SDK
44
+ if ! command -v gcloud &> /dev/null; then
45
+ echo -e "${YELLOW}☁️ Installing Google Cloud SDK...${NC}"
46
+ brew install --cask google-cloud-sdk
47
+ echo -e "${GREEN}✅ Google Cloud SDK installed${NC}"
48
+
49
+ echo ""
50
+ echo -e "${YELLOW}📝 Next steps:${NC}"
51
+ echo "1. Restart your terminal to load gcloud"
52
+ echo "2. Run: gcloud auth login"
53
+ echo "3. Run: gcloud auth application-default login"
54
+ echo "4. Run: gcloud config set project YOUR_PROJECT_ID"
55
+ echo "5. Run: ./deploy.sh"
56
+ else
57
+ echo -e "${GREEN}✅ Google Cloud SDK installed${NC}"
58
+ fi
59
+
60
+ echo ""
61
+ echo -e "${BLUE}========================================${NC}"
62
+ echo -e "${GREEN}✅ Setup complete!${NC}"
63
+ echo ""
64
+ echo "Next steps:"
65
+ echo "1. Authenticate with Google Cloud:"
66
+ echo -e " ${YELLOW}gcloud auth login${NC}"
67
+ echo -e " ${YELLOW}gcloud auth application-default login${NC}"
68
+ echo ""
69
+ echo "2. Set your GCP project:"
70
+ echo -e " ${YELLOW}gcloud config set project YOUR_PROJECT_ID${NC}"
71
+ echo ""
72
+ echo "3. Set your API keys:"
73
+ echo -e " ${YELLOW}export GROQ_API_KEY='your-groq-key'${NC}"
74
+ echo -e " ${YELLOW}export GOOGLE_API_KEY='your-google-key'${NC}"
75
+ echo ""
76
+ echo "4. Deploy to Cloud Run:"
77
+ echo -e " ${YELLOW}./deploy.sh${NC}"
78
+ echo ""
src/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Data Science Copilot - AI-powered data science automation."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .orchestrator import DataScienceCopilot
6
+
7
+ __all__ = ["DataScienceCopilot"]
src/api/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """
2
+ Cloud Run API Module
3
+ FastAPI wrapper for DataScienceCopilot
4
+ """
src/api/app.py ADDED
@@ -0,0 +1,513 @@
1
+ """
2
+ FastAPI Application for Google Cloud Run
3
+ Thin HTTP wrapper around DataScienceCopilot - No logic changes, just API exposure.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import tempfile
9
+ import shutil
10
+ from pathlib import Path
11
+ from typing import Optional, Dict, Any, List
12
+ import logging
13
+ from dotenv import load_dotenv
14
+
15
+ # Load environment variables from .env file
16
+ load_dotenv()
17
+
18
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Request
19
+ from fastapi.responses import JSONResponse, FileResponse
20
+ from fastapi.staticfiles import StaticFiles
21
+ from fastapi.middleware.cors import CORSMiddleware
22
+ from pydantic import BaseModel
23
+
24
+ # Add src to path for imports
25
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
26
+
27
+ from orchestrator import DataScienceCopilot
28
+
29
+ # Configure logging
30
+ logging.basicConfig(level=logging.INFO)
31
+ logger = logging.getLogger(__name__)
32
+
33
+ # Initialize FastAPI
34
+ app = FastAPI(
35
+ title="Data Science Agent API",
36
+ description="Cloud Run wrapper for autonomous data science workflows",
37
+ version="1.0.0"
38
+ )
39
+
40
+ # Enable CORS for frontend
41
+ app.add_middleware(
42
+ CORSMiddleware,
43
+ allow_origins=["*"], # Configure this properly in production
44
+ allow_credentials=True,
45
+ allow_methods=["*"],
46
+ allow_headers=["*"],
47
+ )
48
+
49
+ # Initialize agent once (singleton pattern for stateless service)
50
+ # Agent itself is stateless - no conversation memory between requests
51
+ agent: Optional[DataScienceCopilot] = None
52
+
53
+ # Mount static files for React frontend
54
+ frontend_path = Path(__file__).parent.parent.parent / "FRRONTEEEND" / "dist"
55
+ if frontend_path.exists():
56
+ app.mount("/assets", StaticFiles(directory=str(frontend_path / "assets")), name="assets")
57
+ logger.info(f"✅ Frontend assets mounted from {frontend_path}")
58
+
59
+
60
+ @app.on_event("startup")
61
+ async def startup_event():
62
+ """Initialize DataScienceCopilot on service startup."""
63
+ global agent
64
+ try:
65
+ logger.info("Initializing DataScienceCopilot...")
66
+ agent = DataScienceCopilot(
67
+ reasoning_effort="medium",
68
+ provider=os.getenv("LLM_PROVIDER", "groq")
69
+ )
70
+ logger.info(f"✅ Agent initialized with provider: {agent.provider}")
71
+ except Exception as e:
72
+ logger.error(f"❌ Failed to initialize agent: {e}")
73
+ raise
74
+
75
+
76
+ @app.get("/api/health")
77
+ async def root():
78
+ """Health check endpoint."""
79
+ return {
80
+ "service": "Data Science Agent API",
81
+ "status": "healthy",
82
+ "provider": agent.provider if agent else "not initialized",
83
+ "tools_available": len(agent.tool_functions) if agent else 0
84
+ }
85
+
86
+
87
+ @app.get("/health")
88
+ async def health_check():
89
+ """
90
+ Health check for Cloud Run.
91
+ Returns 200 if service is ready to accept requests.
92
+ """
93
+ if agent is None:
94
+ raise HTTPException(status_code=503, detail="Agent not initialized")
95
+
96
+ return {
97
+ "status": "healthy",
98
+ "agent_ready": True,
99
+ "provider": agent.provider,
100
+ "tools_count": len(agent.tool_functions)
101
+ }
102
+
103
+
104
+ class AnalysisRequest(BaseModel):
105
+ """Request model for analysis endpoint (JSON body)."""
106
+ task_description: str
107
+ target_col: Optional[str] = None
108
+ use_cache: bool = True
109
+ max_iterations: int = 20
110
+
111
+
112
+ @app.post("/run")
113
+ async def run_analysis(
114
+ file: UploadFile = File(..., description="Dataset file (CSV or Parquet)"),
115
+ task_description: str = Form(..., description="Natural language task description"),
116
+ target_col: Optional[str] = Form(None, description="Target column name for prediction"),
117
+ use_cache: bool = Form(True, description="Enable caching for expensive operations"),
118
+ max_iterations: int = Form(20, description="Maximum workflow iterations")
119
+ ) -> JSONResponse:
120
+ """
121
+ Run complete data science workflow on uploaded dataset.
122
+
123
+ This is a thin wrapper - all logic lives in DataScienceCopilot.analyze().
124
+
125
+ Args:
126
+ file: CSV or Parquet file upload
127
+ task_description: Natural language description of the task
128
+ target_col: Optional target column for ML tasks
129
+ use_cache: Whether to use cached results
130
+ max_iterations: Maximum number of workflow steps
131
+
132
+ Returns:
133
+ JSON response with analysis results, workflow history, and execution stats
134
+
135
+ Example:
136
+ ```bash
137
+ curl -X POST http://localhost:8080/run \
138
+ -F "file=@data.csv" \
139
+ -F "task_description=Analyze this dataset and predict house prices" \
140
+ -F "target_col=price"
141
+ ```
142
+ """
143
+ if agent is None:
144
+ raise HTTPException(status_code=503, detail="Agent not initialized")
145
+
146
+ # Validate file format
147
+ filename = file.filename.lower()
148
+ if not (filename.endswith('.csv') or filename.endswith('.parquet')):
149
+ raise HTTPException(
150
+ status_code=400,
151
+ detail="Invalid file format. Only CSV and Parquet files are supported."
152
+ )
153
+
154
+ # Use /tmp for Cloud Run (ephemeral storage)
155
+ temp_dir = Path("/tmp") / "data_science_agent"
156
+ temp_dir.mkdir(parents=True, exist_ok=True)
157
+
158
+ temp_file_path = None
159
+
160
+ try:
161
+ # Save uploaded file to temporary location
162
+ temp_file_path = temp_dir / file.filename
163
+ logger.info(f"Saving uploaded file to: {temp_file_path}")
164
+
165
+ with open(temp_file_path, "wb") as buffer:
166
+ shutil.copyfileobj(file.file, buffer)
167
+
168
+ logger.info(f"File saved successfully: {file.filename} ({os.path.getsize(temp_file_path)} bytes)")
169
+
170
+ # Call existing agent logic - NO CHANGES to orchestrator
171
+ logger.info(f"Starting analysis with task: {task_description}")
172
+ result = agent.analyze(
173
+ file_path=str(temp_file_path),
174
+ task_description=task_description,
175
+ target_col=target_col,
176
+ use_cache=use_cache,
177
+ max_iterations=max_iterations
178
+ )
179
+
180
+ logger.info(f"Analysis completed: {result.get('status')}")
181
+
182
+ # Filter out non-JSON-serializable objects (like matplotlib/plotly Figures)
183
+ def make_json_serializable(obj):
184
+ """Recursively convert objects to JSON-serializable format."""
185
+ if isinstance(obj, dict):
186
+ return {k: make_json_serializable(v) for k, v in obj.items()}
187
+ elif isinstance(obj, list):
188
+ return [make_json_serializable(item) for item in obj]
189
+ elif hasattr(obj, '__class__') and obj.__class__.__name__ in ['Figure', 'Axes', 'Artist']:
190
+ # Skip matplotlib/plotly Figure objects
191
+ return f"<{obj.__class__.__name__} object - see artifacts>"
192
+ elif isinstance(obj, (str, int, float, bool, type(None))):
193
+ return obj
194
+ else:
195
+ # Try to convert to string for other types
196
+ try:
197
+ return str(obj)
198
+ except:
199
+ return f"<{type(obj).__name__}>"
200
+
201
+ serializable_result = make_json_serializable(result)
202
+
203
+ # Return result as-is from orchestrator
204
+ return JSONResponse(
205
+ content={
206
+ "success": result.get("status") == "success",
207
+ "result": serializable_result,
208
+ "metadata": {
209
+ "filename": file.filename,
210
+ "task": task_description,
211
+ "target": target_col,
212
+ "provider": agent.provider
213
+ }
214
+ },
215
+ status_code=200
216
+ )
217
+
218
+ except Exception as e:
219
+ logger.error(f"Analysis failed: {str(e)}", exc_info=True)
220
+ raise HTTPException(
221
+ status_code=500,
222
+ detail={
223
+ "error": str(e),
224
+ "error_type": type(e).__name__,
225
+ "message": "Analysis workflow failed. Check logs for details."
226
+ }
227
+ )
228
+
229
+ finally:
230
+ # Cleanup temporary file
231
+ if temp_file_path and temp_file_path.exists():
232
+ try:
233
+ temp_file_path.unlink()
234
+ logger.info(f"Cleaned up temporary file: {temp_file_path}")
235
+ except Exception as e:
236
+ logger.warning(f"Failed to cleanup temp file: {e}")
237
+
238
+
239
+ @app.post("/profile")
240
+ async def profile_dataset(
241
+ file: UploadFile = File(..., description="Dataset file (CSV or Parquet)")
242
+ ) -> JSONResponse:
243
+ """
244
+ Quick dataset profiling without full workflow.
245
+
246
+ Returns basic statistics, data types, and quality issues.
247
+ Useful for initial data exploration without running full analysis.
248
+
249
+ Example:
250
+ ```bash
251
+ curl -X POST http://localhost:8080/profile \
252
+ -F "file=@data.csv"
253
+ ```
254
+ """
255
+ if agent is None:
256
+ raise HTTPException(status_code=503, detail="Agent not initialized")
257
+
258
+ filename = file.filename.lower()
259
+ if not (filename.endswith('.csv') or filename.endswith('.parquet')):
260
+ raise HTTPException(
261
+ status_code=400,
262
+ detail="Invalid file format. Only CSV and Parquet files are supported."
263
+ )
264
+
265
+ temp_dir = Path("/tmp") / "data_science_agent"
266
+ temp_dir.mkdir(parents=True, exist_ok=True)
267
+ temp_file_path = None
268
+
269
+ try:
270
+ # Save file temporarily
271
+ temp_file_path = temp_dir / file.filename
272
+ with open(temp_file_path, "wb") as buffer:
273
+ shutil.copyfileobj(file.file, buffer)
274
+
275
+ # Import profiling tool directly
276
+ from tools.data_profiling import profile_dataset as profile_tool
277
+ from tools.data_profiling import detect_data_quality_issues
278
+
279
+ # Run profiling tools
280
+ logger.info(f"Profiling dataset: {file.filename}")
281
+ profile_result = profile_tool(str(temp_file_path))
282
+ quality_result = detect_data_quality_issues(str(temp_file_path))
283
+
284
+ return JSONResponse(
285
+ content={
286
+ "success": True,
287
+ "filename": file.filename,
288
+ "profile": profile_result,
289
+ "quality_issues": quality_result
290
+ },
291
+ status_code=200
292
+ )
293
+
294
+ except Exception as e:
295
+ logger.error(f"Profiling failed: {str(e)}", exc_info=True)
296
+ raise HTTPException(
297
+ status_code=500,
298
+ detail={
299
+ "error": str(e),
300
+ "error_type": type(e).__name__
301
+ }
302
+ )
303
+
304
+ finally:
305
+ if temp_file_path and temp_file_path.exists():
306
+ try:
307
+ temp_file_path.unlink()
308
+ except Exception as e:
309
+ logger.warning(f"Failed to cleanup temp file: {e}")
310
+
311
+
312
+ @app.get("/tools")
313
+ async def list_tools():
314
+ """
315
+ List all available tools in the agent.
316
+
317
+ Returns tool names organized by category.
318
+ Useful for understanding agent capabilities.
319
+ """
320
+ if agent is None:
321
+ raise HTTPException(status_code=503, detail="Agent not initialized")
322
+
323
+ from tools.tools_registry import get_tools_by_category
324
+
325
+ return {
326
+ "total_tools": len(agent.tool_functions),
327
+ "tools_by_category": get_tools_by_category(),
328
+ "all_tools": list(agent.tool_functions.keys())
329
+ }
330
+
331
+
332
+ class ChatMessage(BaseModel):
333
+ """Chat message model."""
334
+ role: str # 'user' or 'assistant'
335
+ content: str
336
+
337
+
338
+ class ChatRequest(BaseModel):
339
+ """Chat request model."""
340
+ messages: List[ChatMessage]
341
+ stream: bool = False
342
+
343
+
344
+ @app.post("/chat")
345
+ async def chat(request: ChatRequest) -> JSONResponse:
346
+ """
347
+ Chat endpoint for conversational interface.
348
+
349
+ Processes chat messages and returns agent responses.
350
+ Uses the same underlying agent as /run but in chat format.
351
+
352
+ Args:
353
+ request: Chat request with message history
354
+
355
+ Returns:
356
+ JSON response with agent's reply
357
+ """
358
+ if agent is None:
359
+ raise HTTPException(status_code=503, detail="Agent not initialized")
360
+
361
+ try:
362
+ # Extract the latest user message
363
+ user_messages = [msg for msg in request.messages if msg.role == "user"]
364
+ if not user_messages:
365
+ raise HTTPException(status_code=400, detail="No user message found")
366
+
367
+ latest_message = user_messages[-1].content
368
+
369
+ # Check for API key
370
+ api_key = os.getenv("GOOGLE_API_KEY")
371
+ if not api_key:
372
+ raise HTTPException(
373
+ status_code=500,
374
+ detail="GOOGLE_API_KEY not configured. Please set the environment variable."
375
+ )
376
+
377
+ # Use Google Gemini API
378
+ import google.generativeai as genai
379
+
380
+ logger.info(f"Configuring Gemini with API key (length: {len(api_key)})")
381
+ genai.configure(api_key=api_key)
382
+
383
+ # Initialize Gemini model
384
+ model = genai.GenerativeModel(
385
+ model_name=os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite"),
386
+ system_instruction="You are a Senior Data Science Autonomous Agent. You help users with end-to-end machine learning, data profiling, visualization, and strategic insights. Use a professional, technical yet accessible tone. Provide code snippets in Python if requested. You have access to tools for data analysis, ML training, visualization, and more."
387
+ )
388
+
389
+ # Convert messages to Gemini format (exclude system message, just conversation)
390
+ chat_history = []
391
+ for msg in request.messages[:-1]: # Exclude the latest message
392
+ chat_history.append({
393
+ "role": "user" if msg.role == "user" else "model",
394
+ "parts": [msg.content]
395
+ })
396
+
397
+ # Start chat with history
398
+ chat_session = model.start_chat(history=chat_history)
399
+
400
+ # Send the latest message
401
+ response = chat_session.send_message(latest_message)
402
+
403
+ assistant_message = response.text
404
+
405
+ return JSONResponse(
406
+ content={
407
+ "success": True,
408
+ "message": assistant_message,
409
+ "model": os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite"),
410
+ "provider": "gemini"
411
+ },
412
+ status_code=200
413
+ )
414
+
415
+ except Exception as e:
416
+ logger.error(f"Chat failed: {str(e)}", exc_info=True)
417
+ raise HTTPException(
418
+ status_code=500,
419
+ detail={
420
+ "error": str(e),
421
+ "error_type": type(e).__name__
422
+ }
423
+ )
424
+
425
+
426
+ # Error handlers
427
+ @app.exception_handler(HTTPException)
428
+ async def http_exception_handler(request, exc):
429
+ """Custom error response format."""
430
+ return JSONResponse(
431
+ status_code=exc.status_code,
432
+ content={
433
+ "success": False,
434
+ "error": exc.detail,
435
+ "status_code": exc.status_code
436
+ }
437
+ )
438
+
439
+
440
+ @app.exception_handler(Exception)
441
+ async def general_exception_handler(request, exc):
442
+ """Catch-all error handler."""
443
+ logger.error(f"Unhandled exception: {str(exc)}", exc_info=True)
444
+ return JSONResponse(
445
+ status_code=500,
446
+ content={
447
+ "success": False,
448
+ "error": "Internal server error",
449
+ "detail": str(exc),
450
+ "error_type": type(exc).__name__
451
+ }
452
+ )
453
+
454
+
455
+ @app.get("/outputs/{file_path:path}")
456
+ async def serve_output_files(file_path: str):
457
+ """
458
+ Serve generated output files (reports, plots, models, etc.).
459
+ """
460
+ output_path = Path("./outputs") / file_path
461
+
462
+ if not output_path.exists():
463
+ raise HTTPException(status_code=404, detail=f"File not found: {file_path}")
464
+
465
+ if not output_path.is_file():
466
+ raise HTTPException(status_code=400, detail="Path is not a file")
467
+
468
+ # Security: prevent directory traversal
469
+ try:
470
+ output_path.resolve().relative_to(Path("./outputs").resolve())
471
+ except ValueError:
472
+ raise HTTPException(status_code=403, detail="Access denied")
473
+
474
+ return FileResponse(output_path)
475
+
476
+
477
+ @app.get("/{full_path:path}")
478
+ async def serve_frontend(full_path: str):
479
+ """
480
+ Serve React frontend for all non-API routes.
481
+ This should be the last route defined.
482
+ """
483
+ frontend_path = Path(__file__).parent.parent.parent / "FRRONTEEEND" / "dist"
484
+
485
+ # Try to serve the requested file
486
+ file_path = frontend_path / full_path
487
+ if file_path.is_file():
488
+ return FileResponse(file_path)
489
+
490
+ # Default to index.html for client-side routing
491
+ index_path = frontend_path / "index.html"
492
+ if index_path.exists():
493
+ return FileResponse(index_path)
494
+
495
+ # Frontend not built
496
+ raise HTTPException(
497
+ status_code=404,
498
+ detail="Frontend not found. Please build the frontend first: cd FRRONTEEEND && npm run build"
499
+ )
500
+
501
+
502
+ # Cloud Run listens on PORT environment variable
503
+ if __name__ == "__main__":
504
+ import uvicorn
505
+
506
+ port = int(os.getenv("PORT", 8080))
507
+
508
+ uvicorn.run(
509
+ "app:app",
510
+ host="0.0.0.0",
511
+ port=port,
512
+ log_level="info"
513
+ )
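
For callers who prefer Python over the curl examples in the docstrings above, a hypothetical client sketch for `/run` follows. It assumes the `requests` package, which is not pinned in requirements.txt; the form fields mirror the endpoint signature:

```python
# Hypothetical Python client for the /run endpoint.
import requests  # not pinned in requirements.txt - install separately

with open("data.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8080/run",
        files={"file": ("data.csv", f, "text/csv")},
        data={
            "task_description": "Analyze this dataset and predict the target column",
            "target_col": "target",  # optional, as in the endpoint signature
            "max_iterations": 20,
        },
        timeout=900,  # matches the Cloud Run --timeout used in deploy.sh
    )

resp.raise_for_status()
body = resp.json()
print(body["success"], body["metadata"])
```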
src/cache/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Cache module initialization."""
2
+
3
+ from .cache_manager import CacheManager
4
+
5
+ __all__ = ["CacheManager"]
src/cache/cache_manager.py ADDED
@@ -0,0 +1,292 @@
1
+ """
2
+ Cache Manager for Data Science Copilot
3
+ Uses SQLite for persistent caching of API responses and computation results.
4
+ """
5
+
6
+ import hashlib
7
+ import json
8
+ import sqlite3
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Any, Optional
12
+ import pickle
13
+
14
+
15
+ class CacheManager:
16
+ """
17
+ Manages caching of LLM responses and expensive computations.
18
+
19
+ Uses SQLite for persistence and supports TTL-based invalidation.
20
+ Cache keys are generated from file hashes and operation parameters.
21
+ """
22
+
23
+ def __init__(self, db_path: str = "./cache_db/cache.db", ttl_seconds: int = 86400):
24
+ """
25
+ Initialize cache manager.
26
+
27
+ Args:
28
+ db_path: Path to SQLite database file
29
+ ttl_seconds: Time-to-live for cache entries (default 24 hours)
30
+ """
31
+ self.db_path = Path(db_path)
32
+ self.ttl_seconds = ttl_seconds
33
+
34
+ # Ensure cache directory exists
35
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
36
+
37
+ # Initialize database
38
+ self._init_db()
39
+
40
+ def _init_db(self) -> None:
41
+ """Create cache table if it doesn't exist."""
42
+ try:
43
+ conn = sqlite3.connect(self.db_path)
44
+ cursor = conn.cursor()
45
+
46
+ cursor.execute("""
47
+ CREATE TABLE IF NOT EXISTS cache (
48
+ key TEXT PRIMARY KEY,
49
+ value BLOB NOT NULL,
50
+ created_at INTEGER NOT NULL,
51
+ expires_at INTEGER NOT NULL,
52
+ metadata TEXT
53
+ )
54
+ """)
55
+
56
+ # Create index on expires_at for efficient cleanup
57
+ cursor.execute("""
58
+ CREATE INDEX IF NOT EXISTS idx_expires_at
59
+ ON cache(expires_at)
60
+ """)
61
+
62
+ conn.commit()
63
+ conn.close()
64
+ print(f"✅ Cache database initialized at {self.db_path}")
65
+ except Exception as e:
66
+ print(f"⚠️ Error initializing cache database: {e}")
67
+ print(f" Attempting to recreate database...")
68
+ try:
69
+ # Remove corrupted database and recreate
70
+ if self.db_path.exists():
71
+ self.db_path.unlink()
72
+
73
+ conn = sqlite3.connect(self.db_path)
74
+ cursor = conn.cursor()
75
+
76
+ cursor.execute("""
77
+ CREATE TABLE cache (
78
+ key TEXT PRIMARY KEY,
79
+ value BLOB NOT NULL,
80
+ created_at INTEGER NOT NULL,
81
+ expires_at INTEGER NOT NULL,
82
+ metadata TEXT
83
+ )
84
+ """)
85
+
86
+ cursor.execute("""
87
+ CREATE INDEX idx_expires_at
88
+ ON cache(expires_at)
89
+ """)
90
+
91
+ conn.commit()
92
+ conn.close()
93
+ print(f"✅ Cache database recreated successfully")
94
+ except Exception as e2:
95
+ print(f"❌ Failed to recreate cache database: {e2}")
96
+ print(f" Cache functionality will be disabled")
97
+
98
+ def _generate_key(self, *args, **kwargs) -> str:
99
+ """
100
+ Generate a unique cache key from arguments.
101
+
102
+ Args:
103
+ *args: Positional arguments to hash
104
+ **kwargs: Keyword arguments to hash
105
+
106
+ Returns:
107
+ MD5 hash of the arguments
108
+ """
109
+ # Combine args and kwargs into a single string
110
+ key_data = json.dumps({"args": args, "kwargs": kwargs}, sort_keys=True)
111
+ return hashlib.md5(key_data.encode()).hexdigest()
112
+
113
+ def get(self, key: str) -> Optional[Any]:
114
+ """
115
+ Retrieve value from cache.
116
+
117
+ Args:
118
+ key: Cache key
119
+
120
+ Returns:
121
+ Cached value if exists and not expired, None otherwise
122
+ """
123
+ try:
124
+ conn = sqlite3.connect(self.db_path)
125
+ cursor = conn.cursor()
126
+
127
+ current_time = int(time.time())
128
+
129
+ cursor.execute("""
130
+ SELECT value, expires_at
131
+ FROM cache
132
+ WHERE key = ? AND expires_at > ?
133
+ """, (key, current_time))
134
+
135
+ result = cursor.fetchone()
136
+ conn.close()
137
+ except sqlite3.OperationalError as e:
138
+ print(f"⚠️ Cache read error: {e}")
139
+ print(f" Reinitializing cache database...")
140
+ self._init_db()
141
+ return None
142
+ except Exception as e:
143
+ print(f"⚠️ Unexpected cache error: {e}")
144
+ return None
145
+
146
+ if result:
147
+ value_blob, expires_at = result
148
+ # Deserialize using pickle for complex Python objects
149
+ return pickle.loads(value_blob)
150
+
151
+ return None
152
+
153
+ def set(self, key: str, value: Any, ttl_override: Optional[int] = None,
154
+ metadata: Optional[dict] = None) -> None:
155
+ """
156
+ Store value in cache.
157
+
158
+ Args:
159
+ key: Cache key
160
+ value: Value to cache (must be pickleable)
161
+ ttl_override: Optional override for TTL (seconds)
162
+ metadata: Optional metadata to store with cache entry
163
+ """
164
+ try:
165
+ conn = sqlite3.connect(self.db_path)
166
+ cursor = conn.cursor()
167
+
168
+ current_time = int(time.time())
169
+ ttl = ttl_override if ttl_override is not None else self.ttl_seconds
170
+ expires_at = current_time + ttl
171
+
172
+ # Serialize value using pickle
173
+ value_blob = pickle.dumps(value)
174
+
175
+ # Serialize metadata as JSON
176
+ metadata_json = json.dumps(metadata) if metadata else None
177
+
178
+ cursor.execute("""
179
+ INSERT OR REPLACE INTO cache (key, value, created_at, expires_at, metadata)
180
+ VALUES (?, ?, ?, ?, ?)
181
+ """, (key, value_blob, current_time, expires_at, metadata_json))
182
+
183
+ conn.commit()
184
+ conn.close()
185
+ except sqlite3.OperationalError as e:
186
+ print(f"⚠️ Cache write error: {e}")
187
+ print(f" Reinitializing cache database...")
188
+ self._init_db()
189
+ except Exception as e:
190
+ print(f"⚠️ Unexpected cache error during write: {e}")
191
+
192
+ def invalidate(self, key: str) -> bool:
193
+ """
194
+ Remove specific entry from cache.
195
+
196
+ Args:
197
+ key: Cache key to invalidate
198
+
199
+ Returns:
200
+ True if entry was removed, False if not found
201
+ """
202
+ conn = sqlite3.connect(self.db_path)
203
+ cursor = conn.cursor()
204
+
205
+ cursor.execute("DELETE FROM cache WHERE key = ?", (key,))
206
+ deleted = cursor.rowcount > 0
207
+
208
+ conn.commit()
209
+ conn.close()
210
+
211
+ return deleted
212
+
213
+ def clear_expired(self) -> int:
214
+ """
215
+ Remove all expired entries from cache.
216
+
217
+ Returns:
218
+ Number of entries removed
219
+ """
220
+ conn = sqlite3.connect(self.db_path)
221
+ cursor = conn.cursor()
222
+
223
+ current_time = int(time.time())
224
+ cursor.execute("DELETE FROM cache WHERE expires_at <= ?", (current_time,))
225
+ deleted = cursor.rowcount
226
+
227
+ conn.commit()
228
+ conn.close()
229
+
230
+ return deleted
231
+
232
+ def clear_all(self) -> None:
233
+ """Remove all entries from cache."""
234
+ conn = sqlite3.connect(self.db_path)
235
+ cursor = conn.cursor()
236
+
237
+ cursor.execute("DELETE FROM cache")
238
+
239
+ conn.commit()
240
+ conn.close()
241
+
242
+ def get_stats(self) -> dict:
243
+ """
244
+ Get cache statistics.
245
+
246
+ Returns:
247
+ Dictionary with cache stats (total entries, expired, size)
248
+ """
249
+ conn = sqlite3.connect(self.db_path)
250
+ cursor = conn.cursor()
251
+
252
+ current_time = int(time.time())
253
+
254
+ # Total entries
255
+ cursor.execute("SELECT COUNT(*) FROM cache")
256
+ total = cursor.fetchone()[0]
257
+
258
+ # Valid entries
259
+ cursor.execute("SELECT COUNT(*) FROM cache WHERE expires_at > ?", (current_time,))
260
+ valid = cursor.fetchone()[0]
261
+
262
+ # Database size
263
+ cursor.execute("SELECT page_count * page_size FROM pragma_page_count(), pragma_page_size()")
264
+ size_bytes = cursor.fetchone()[0]
265
+
266
+ conn.close()
267
+
268
+ return {
269
+ "total_entries": total,
270
+ "valid_entries": valid,
271
+ "expired_entries": total - valid,
272
+ "size_mb": round(size_bytes / (1024 * 1024), 2)
273
+ }
274
+
275
+ def generate_file_hash(self, file_path: str) -> str:
276
+ """
277
+ Generate hash of file contents for cache key.
278
+
279
+ Args:
280
+ file_path: Path to file
281
+
282
+ Returns:
283
+ MD5 hash of file contents
284
+ """
285
+ hasher = hashlib.md5()
286
+
287
+ with open(file_path, 'rb') as f:
288
+ # Read file in chunks to handle large files
289
+ for chunk in iter(lambda: f.read(4096), b""):
290
+ hasher.update(chunk)
291
+
292
+ return hasher.hexdigest()
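
A minimal usage sketch for `CacheManager`, keying an entry on the dataset's content hash. The `profile:` key scheme is illustrative only; how the orchestrator actually composes keys is defined elsewhere in the repo. The `sys.path` setup mirrors the pattern used in `examples/titanic_example.py`:

```python
# Illustrative CacheManager usage (script assumed to live at the repo root).
import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
from cache import CacheManager

cache = CacheManager(db_path="./cache_db/cache.db", ttl_seconds=3600)

file_hash = cache.generate_file_hash("data.csv")
key = f"profile:{file_hash}"  # hypothetical key scheme

profile = cache.get(key)
if profile is None:
    profile = {"rows": 891, "columns": 12}  # stand-in for a real profiling result
    cache.set(key, profile, metadata={"tool": "profile_dataset"})

print(cache.get_stats())  # total/valid/expired entries and size in MB
```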