Akshay4506 committed on
Commit
f800c3b
·
1 Parent(s): 9423e26

feat: finalized stable architecture with consolidated structure and single-worker mode

Browse files
Dockerfile CHANGED
@@ -40,5 +40,5 @@ RUN pip install --no-cache-dir git+https://github.com/SAP-samples/sap-rpt-1-oss.
40
  # Expose port 7860 (Hugging Face Spaces default port)
41
  EXPOSE 7860
42
 
43
- # Run the FastAPI app
44
- CMD ["python", "-m", "uvicorn", "webapp.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
40
  # Expose port 7860 (Hugging Face Spaces default port)
41
  EXPOSE 7860
42
 
43
+ # Run the FastAPI app with a single worker to save RAM and avoid download race conditions
44
+ CMD ["python", "-m", "uvicorn", "webapp.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
code/analysis/__init__.py DELETED
@@ -1,11 +0,0 @@
1
- """
2
- Analysis Package
3
- ================
4
-
5
- Results aggregation, statistical analysis, and visualization.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- __all__ = ['aggregate_results']
 
 
 
 
 
 
 
 
 
 
 
 
code/analysis/aggregate_results.py DELETED
@@ -1,99 +0,0 @@
1
- """
2
- Results Aggregation
3
- ===================
4
-
5
- Aggregate all experiment results into summary tables.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- import glob
12
- import json
13
- import pandas as pd
14
- import os
15
- import logging
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- def aggregate_all_results(
21
- results_dir: str = '../results/raw',
22
- output_file: str = '../results/processed/aggregated_results.csv'
23
- ) -> pd.DataFrame:
24
- """
25
- Aggregate all experiment results into single DataFrame.
26
-
27
- Parameters
28
- ----------
29
- results_dir : str
30
- Directory containing result JSON files
31
- output_file : str
32
- Where to save aggregated CSV
33
-
34
- Returns
35
- -------
36
- df : pd.DataFrame
37
- Aggregated results
38
- """
39
- logger.info(f"Aggregating results from {results_dir}")
40
-
41
- result_files = glob.glob(os.path.join(results_dir, '*.json'))
42
- logger.info(f"Found {len(result_files)} result files")
43
-
44
- aggregated = []
45
-
46
- for file in result_files:
47
- try:
48
- with open(file) as f:
49
- data = json.load(f)
50
-
51
- record = {
52
- 'dataset': data['dataset'],
53
- 'model': data['model'],
54
- 'task_type': data['task_type'],
55
- 'n_samples': data['n_samples'],
56
- 'n_features': data['n_features'],
57
- 'n_folds': data['n_folds']
58
- }
59
-
60
- # Add mean metrics
61
- for metric, value in data['mean_metrics'].items():
62
- record[f'mean_{metric}'] = value
63
-
64
- # Add std metrics
65
- for metric, value in data['std_metrics'].items():
66
- record[f'std_{metric}'] = value
67
-
68
- # Add compute info
69
- if 'compute' in data:
70
- record['elapsed_hours'] = data['compute'].get('elapsed_hours')
71
- record['cost_usd'] = data['compute'].get('cost_usd')
72
-
73
- aggregated.append(record)
74
-
75
- except Exception as e:
76
- logger.warning(f"Failed to process {file}: {e}")
77
-
78
- # Create DataFrame
79
- df = pd.DataFrame(aggregated)
80
-
81
- # Save
82
- os.makedirs(os.path.dirname(output_file), exist_ok=True)
83
- df.to_csv(output_file, index=False)
84
-
85
- logger.info(f"Aggregated {len(df)} results to {output_file}")
86
-
87
- return df
88
-
89
-
90
- if __name__ == "__main__":
91
- logging.basicConfig(level=logging.INFO)
92
-
93
- df = aggregate_all_results()
94
-
95
- print(f"\n✅ Aggregated {len(df)} experiment results")
96
- print(f"\nDatasets: {df['dataset'].nunique()}")
97
- print(f"Models: {df['model'].nunique()}")
98
- print(f"\nSample of results:")
99
- print(df.head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/config/datasets.yaml DELETED
@@ -1,33 +0,0 @@
1
- # Dataset Configuration
2
- # =====================
3
-
4
- # Local Datasets (from datasets folder)
5
- local_datasets:
6
- enabled: true
7
- path: '../datasets'
8
-
9
- # TabZilla Datasets (subset of 20)
10
- tabzilla:
11
- enabled: false # Enable when data is available
12
- path: '../datasets/tabzilla'
13
-
14
- # OpenML-CC18 (Classification subset)
15
- openml_cc18:
16
- enabled: false
17
- path: '../datasets/openml_cc18'
18
-
19
- # Dataset Filters
20
- filters:
21
- min_samples: 100
22
- max_samples: 100000
23
- min_features: 2
24
- max_features: 1000
25
- task_types:
26
- - classification
27
- - regression
28
-
29
- # Preprocessing
30
- preprocessing:
31
- handle_missing: 'mean' # mean, median, most_frequent, drop
32
- encode_categoricals: true
33
- scale_features: false # Most models handle scaling internally
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/config/experiments.yaml DELETED
@@ -1,64 +0,0 @@
1
- # Experiment Configuration
2
- # ========================
3
-
4
- # Cross-Validation Settings
5
- n_folds: 10
6
- random_state: 42
7
- timeout: 86400 # 24 hours per experiment
8
-
9
- # Compute Resources
10
- cost_per_hour: 0.90 # USD per GPU-hour (H200)
11
- gpu_type: 'H200'
12
- gpu_memory_limit: 80 # GB
13
- checkpoint_interval: 3600 # Save checkpoint every hour
14
-
15
- # Model-Specific Parameters
16
- model_params:
17
- sap_rpt1:
18
- context_size: 4096
19
- bagging_factor: 4
20
- model_size: 'small' # or 'large'
21
-
22
- sap_rpt1_hf:
23
- max_context_size: 4096
24
- bagging: 4
25
-
26
- tabpfn:
27
- n_ensemble: 1
28
- device: 'auto'
29
-
30
- autogluon:
31
- time_limit: 300 # 5 minutes
32
- preset: 'medium_quality' # best_quality, high_quality, good_quality, medium_quality
33
-
34
- xgboost:
35
- n_estimators: 100
36
- learning_rate: 0.1
37
- max_depth: 6
38
-
39
- catboost:
40
- iterations: 100
41
- learning_rate: 0.1
42
- depth: 6
43
-
44
- lightgbm:
45
- n_estimators: 100
46
- learning_rate: 0.1
47
- max_depth: -1
48
-
49
- # Evaluation Metrics
50
- primary_metric:
51
- classification: 'roc_auc'
52
- regression: 'r2'
53
-
54
- # Statistical Testing
55
- statistical_tests:
56
- friedman_alpha: 0.05
57
- nemenyi_alpha: 0.05
58
-
59
- # Reproducibility
60
- reproducibility:
61
- save_predictions: true
62
- save_models: false # Models can be large
63
- log_hyperparameters: true
64
- track_compute: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/config/models.yaml DELETED
@@ -1,84 +0,0 @@
1
- # Model Configuration
2
- # ====================
3
-
4
- models:
5
- # SAP RPT-1 (Primary Model)
6
- - name: 'sap-rpt1-small'
7
- enabled: true
8
- priority: 'high'
9
- docker_image: 'sap-rpt1'
10
-
11
- - name: 'sap-rpt1-large'
12
- enabled: true
13
- priority: 'high'
14
- docker_image: 'sap-rpt1'
15
-
16
- # SAP RPT-1 OSS via Hugging Face (Open Source)
17
- - name: 'sap-rpt1-hf'
18
- enabled: true
19
- priority: 'high'
20
- docker_image: 'sap-rpt1'
21
- description: 'SAP RPT-1 OSS model via HuggingFace token authentication'
22
-
23
- # Pretrained Competitors
24
- - name: 'tabpfn'
25
- enabled: true
26
- priority: 'high'
27
- docker_image: 'tabpfn'
28
-
29
- - name: 'tabicl'
30
- enabled: false # Enable when implementation ready
31
- priority: 'medium'
32
- docker_image: 'tabicl'
33
-
34
- # AutoML
35
- - name: 'autogluon'
36
- enabled: true
37
- priority: 'medium'
38
- docker_image: 'autogluon'
39
-
40
- # Gradient Boosting Baselines
41
- - name: 'xgboost'
42
- enabled: true
43
- priority: 'medium'
44
- docker_image: 'baselines'
45
-
46
- - name: 'catboost'
47
- enabled: true
48
- priority: 'medium'
49
- docker_image: 'baselines'
50
-
51
- - name: 'lightgbm'
52
- enabled: true
53
- priority: 'low'
54
- docker_image: 'baselines'
55
-
56
- # Model Groups (for batch experiments)
57
- model_groups:
58
- all:
59
- - sap-rpt1-small
60
- - sap-rpt1-large
61
- - sap-rpt1-hf
62
- - tabpfn
63
- - autogluon
64
- - xgboost
65
- - catboost
66
- - lightgbm
67
-
68
- pretrained_only:
69
- - sap-rpt1-small
70
- - sap-rpt1-large
71
- - sap-rpt1-hf
72
- - tabpfn
73
-
74
- baselines_only:
75
- - xgboost
76
- - catboost
77
- - lightgbm
78
-
79
- high_priority:
80
- - sap-rpt1-small
81
- - sap-rpt1-large
82
- - sap-rpt1-hf
83
- - tabpfn
84
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/docker/Dockerfile DELETED
@@ -1,102 +0,0 @@
1
- # =============================================================================
2
- # SAP RPT-1 Benchmarking - Multi-stage Dockerfile
3
- # =============================================================================
4
- # Builds two targets:
5
- # - sap-rpt1: Python 3.11 with SAP RPT-1 OSS + all dependencies
6
- # - baselines: Python 3.11 with XGBoost, CatBoost, LightGBM
7
- #
8
- # Usage:
9
- # docker-compose build
10
- # docker-compose run sap-rpt1
11
- # docker-compose run baselines
12
- # =============================================================================
13
-
14
- # ---------- Base stage (shared by all targets) ----------
15
- FROM python:3.11-slim AS base
16
-
17
- # System dependencies
18
- RUN apt-get update && apt-get install -y --no-install-recommends \
19
- git \
20
- build-essential \
21
- && rm -rf /var/lib/apt/lists/*
22
-
23
- WORKDIR /app
24
-
25
- # Copy requirements first (for Docker layer caching)
26
- COPY requirements.txt /app/requirements.txt
27
-
28
- # ---------- SAP RPT-1 target ----------
29
- FROM base AS sap-rpt1
30
-
31
- # Install core scientific stack first (heavy packages)
32
- RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
33
- numpy==1.26.4 \
34
- pandas==2.2.3 \
35
- scikit-learn==1.6.1 \
36
- scipy==1.14.1 \
37
- matplotlib==3.9.2 \
38
- seaborn==0.13.2
39
-
40
- # Install Hugging Face and PyTorch stack
41
- RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
42
- --extra-index-url https://download.pytorch.org/whl/cpu \
43
- torch==2.7.0+cpu \
44
- transformers==4.52.4 \
45
- accelerate==1.6.0 \
46
- huggingface-hub==0.30.2 \
47
- datasets==3.5.0 \
48
- pyarrow==20.0.0 \
49
- torcheval==0.0.7
50
-
51
- # Install SAP RPT-1 and remaining requirements
52
- RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir -r requirements.txt
53
-
54
- # Copy project code
55
- COPY . /app
56
-
57
- # Set Python path
58
- ENV PYTHONPATH=/app/code
59
-
60
- WORKDIR /app/code
61
-
62
- # Set entrypoint so you can run via arguments natively
63
- ENTRYPOINT ["python"]
64
- CMD ["-m", "runners.run_experiment", "--dataset", "adult", "--model", "sap-rpt1-hf"]
65
-
66
- # ---------- Baselines target ----------
67
- FROM base AS baselines
68
-
69
- # Install core scientific stack (heavy packages)
70
- RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
71
- numpy==1.26.4 \
72
- pandas==2.2.3 \
73
- scikit-learn==1.6.1 \
74
- scipy==1.14.1
75
-
76
- # Install visualization and utilities
77
- RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
78
- matplotlib==3.9.2 \
79
- seaborn==0.13.2 \
80
- pyyaml==6.0.2 \
81
- tqdm==4.67.1 \
82
- joblib==1.4.2 \
83
- python-dotenv==1.0.1
84
-
85
- # Install ML frameworks and OpenML
86
- RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
87
- openml==0.14.2 \
88
- xgboost \
89
- catboost \
90
- lightgbm
91
-
92
- # Copy project code
93
- COPY . /app
94
-
95
- # Set Python path
96
- ENV PYTHONPATH=/app/code
97
-
98
- WORKDIR /app/code
99
-
100
- # Set entrypoint so you can run via arguments natively
101
- ENTRYPOINT ["python"]
102
- CMD ["-m", "runners.run_batch", "--datasets", "config/datasets.yaml", "--models", "config/models.yaml"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/evaluation/__init__.py DELETED
@@ -1,24 +0,0 @@
1
- """
2
- Evaluation Package
3
- ==================
4
-
5
- Tools for model evaluation, statistical testing, and benchmarking.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- from .metrics import calculate_classification_metrics, calculate_regression_metrics
12
- from .cross_validation import run_cross_validation
13
- from .statistical_tests import friedman_test, nemenyi_post_hoc, critical_difference
14
- from .compute_tracker import ComputeTracker
15
-
16
- __all__ = [
17
- 'calculate_classification_metrics',
18
- 'calculate_regression_metrics',
19
- 'run_cross_validation',
20
- 'friedman_test',
21
- 'nemenyi_post_hoc',
22
- 'critical_difference',
23
- 'ComputeTracker'
24
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/evaluation/compute_tracker.py DELETED
@@ -1,114 +0,0 @@
1
- """
2
- Compute Resource Tracker
3
- =========================
4
-
5
- Track GPU hours, costs, and resource usage for experiments.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- import time
12
- import numpy as np
13
- from typing import Dict, Optional, List
14
-
15
- try:
16
- import psutil
17
- HAS_PSUTIL = True
18
- except ImportError:
19
- HAS_PSUTIL = False
20
- import logging
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
- class ComputeTracker:
26
- """
27
- Track compute resources and costs.
28
-
29
- Parameters
30
- ----------
31
- cost_per_hour : float
32
- Cost per GPU-hour in USD
33
- gpu_type : str
34
- GPU type (e.g., 'H200', 'A100', 'L40S')
35
- """
36
-
37
- def __init__(self, cost_per_hour: float = 0.90, gpu_type: str = 'H200'):
38
- self.cost_per_hour = cost_per_hour
39
- self.gpu_type = gpu_type
40
- self.start_time: Optional[float] = None
41
- self.end_time: Optional[float] = None
42
- self.gpu_usage_log: List[Dict] = []
43
-
44
- def start(self):
45
- """Start tracking."""
46
- self.start_time = time.time()
47
- self.gpu_usage_log = []
48
- logger.info(f"Compute tracking started (GPU: {self.gpu_type}, ${self.cost_per_hour}/hr)")
49
-
50
- def log_gpu_usage(self):
51
- """Log current GPU usage."""
52
- try:
53
- import GPUtil
54
- gpus = GPUtil.getGPUs()
55
-
56
- for gpu in gpus:
57
- self.gpu_usage_log.append({
58
- 'timestamp': time.time(),
59
- 'gpu_id': gpu.id,
60
- 'gpu_load': gpu.load * 100,
61
- 'memory_used_mb': gpu.memoryUsed,
62
- 'memory_total_mb': gpu.memoryTotal,
63
- 'memory_util': (gpu.memoryUsed / gpu.memoryTotal) * 100,
64
- 'temperature': getattr(gpu, 'temperature', None)
65
- })
66
- except ImportError:
67
- logger.warning("GPUtil not installed, GPU tracking unavailable")
68
- except Exception as e:
69
- logger.warning(f"GPU logging failed: {e}")
70
-
71
- def stop(self) -> Dict:
72
- """
73
- Stop tracking and calculate costs.
74
-
75
- Returns
76
- -------
77
- summary : dict
78
- Elapsed time, costs, and GPU usage summary
79
- """
80
- self.end_time = time.time()
81
-
82
- elapsed_hours = (self.end_time - self.start_time) / 3600
83
- total_cost = elapsed_hours * self.cost_per_hour
84
-
85
- # CPU usage
86
- if HAS_PSUTIL:
87
- cpu_percent = psutil.cpu_percent(interval=1)
88
- memory_info = psutil.virtual_memory()
89
- memory_percent = memory_info.percent
90
- memory_used_gb = memory_info.used / (1024 ** 3)
91
- else:
92
- cpu_percent = 0.0
93
- memory_percent = 0.0
94
- memory_used_gb = 0.0
95
-
96
- summary = {
97
- 'elapsed_hours': elapsed_hours,
98
- 'cost_usd': total_cost,
99
- 'cost_per_hour': self.cost_per_hour,
100
- 'gpu_type': self.gpu_type,
101
- 'cpu_percent': cpu_percent,
102
- 'memory_percent': memory_percent,
103
- 'memory_used_gb': memory_used_gb,
104
- 'gpu_logs_count': len(self.gpu_usage_log)
105
- }
106
-
107
- # Average GPU utilization
108
- if self.gpu_usage_log:
109
- summary['avg_gpu_load'] = np.mean([log['gpu_load'] for log in self.gpu_usage_log])
110
- summary['avg_gpu_memory_util'] = np.mean([log['memory_util'] for log in self.gpu_usage_log])
111
-
112
- logger.info(f"Compute tracking stopped: {elapsed_hours:.2f} hours, ${total_cost:.2f}")
113
-
114
- return summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/evaluation/cross_validation.py DELETED
@@ -1,127 +0,0 @@
1
- """
2
- Cross-Validation
3
- ================
4
-
5
- 10-fold stratified cross-validation for model evaluation.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- import numpy as np
12
- import pandas as pd
13
- from sklearn.model_selection import StratifiedKFold, KFold
14
- from sklearn.preprocessing import LabelEncoder
15
- from typing import List, Dict
16
- import logging
17
-
18
- from .metrics import calculate_classification_metrics, calculate_regression_metrics
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- def _encode_categorical_columns(X_train, X_val):
24
- """
25
- Label-encode object/categorical columns. Fitted on X_train,
26
- applied to both X_train and X_val. Unknown categories in X_val
27
- are mapped to -1.
28
- """
29
- X_train = X_train.copy()
30
- X_val = X_val.copy()
31
-
32
- cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
33
- if len(cat_cols) == 0:
34
- return X_train, X_val
35
-
36
- logger.info(f" Encoding {len(cat_cols)} categorical columns: {list(cat_cols[:5])}{'...' if len(cat_cols) > 5 else ''}")
37
-
38
- for col in cat_cols:
39
- le = LabelEncoder()
40
- # Fit on combined unique values from train (+ handle unseen in val)
41
- combined = pd.concat([X_train[col], X_val[col]], axis=0).astype(str)
42
- le.fit(combined)
43
- X_train[col] = le.transform(X_train[col].astype(str))
44
- X_val[col] = le.transform(X_val[col].astype(str))
45
-
46
- return X_train, X_val
47
-
48
-
49
- def run_cross_validation(
50
- model,
51
- X: pd.DataFrame,
52
- y: pd.Series,
53
- task_type: str = 'classification',
54
- n_folds: int = 10,
55
- random_state: int = 42
56
- ) -> List[Dict]:
57
- """
58
- Run k-fold cross-validation.
59
-
60
- Parameters
61
- ----------
62
- model : BaseModelWrapper
63
- Model to evaluate (must have fit/predict methods)
64
- X : pd.DataFrame
65
- Features
66
- y : pd.Series
67
- Target
68
- task_type : str
69
- 'classification' or 'regression'
70
- n_folds : int
71
- Number of folds
72
- random_state : int
73
- Random seed
74
-
75
- Returns
76
- -------
77
- fold_results : list of dict
78
- Results for each fold
79
- """
80
- logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")
81
-
82
- # Choose CV splitter
83
- if task_type == 'classification':
84
- cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
85
- else:
86
- cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
87
-
88
- fold_results = []
89
-
90
- for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
91
- logger.info(f" Fold {fold_idx + 1}/{n_folds}")
92
-
93
- # Split data
94
- X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
95
- y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
96
-
97
- # Auto-encode categorical columns so tree models can handle them
98
- X_train, X_val = _encode_categorical_columns(X_train, X_val)
99
-
100
- # Fit model
101
- model.fit(X_train, y_train)
102
-
103
- # Predict
104
- y_pred = model.predict(X_val)
105
- y_proba = None
106
- if task_type == 'classification':
107
- try:
108
- y_proba = model.predict_proba(X_val)
109
- except:
110
- pass
111
-
112
- # Calculate metrics
113
- if task_type == 'classification':
114
- metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
115
- else:
116
- metrics = calculate_regression_metrics(y_val, y_pred)
117
-
118
- # Add timing info
119
- metrics.update({
120
- 'fold': fold_idx,
121
- 'fit_time': model.fit_time,
122
- 'predict_time': model.predict_time
123
- })
124
-
125
- fold_results.append(metrics)
126
-
127
- return fold_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/evaluation/metrics.py DELETED
@@ -1,116 +0,0 @@
1
- """
2
- Evaluation Metrics
3
- ==================
4
-
5
- Comprehensive metrics for classification and regression tasks.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- import numpy as np
12
- from sklearn.metrics import (
13
- roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
14
- r2_score, mean_squared_error, mean_absolute_error, log_loss
15
- )
16
- from typing import Dict, Optional
17
- import logging
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- def calculate_classification_metrics(
23
- y_true: np.ndarray,
24
- y_pred: np.ndarray,
25
- y_proba: Optional[np.ndarray] = None
26
- ) -> Dict[str, float]:
27
- """
28
- Calculate all classification metrics.
29
-
30
- Parameters
31
- ----------
32
- y_true : np.ndarray
33
- True labels
34
- y_pred : np.ndarray
35
- Predicted labels
36
- y_proba : np.ndarray, optional
37
- Predicted probabilities (n_samples, n_classes)
38
-
39
- Returns
40
- -------
41
- metrics : dict
42
- Dictionary of metric names and values
43
- """
44
- metrics = {
45
- 'accuracy': accuracy_score(y_true, y_pred),
46
- 'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
47
- 'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
48
- 'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
49
- 'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0)
50
- }
51
-
52
- # ROC-AUC (if probabilities available)
53
- if y_proba is not None:
54
- try:
55
- n_classes = len(np.unique(y_true))
56
-
57
- if n_classes == 2:
58
- # Binary classification
59
- metrics['roc_auc'] = roc_auc_score(y_true, y_proba[:, 1])
60
- else:
61
- # Multi-class classification
62
- metrics['roc_auc'] = roc_auc_score(
63
- y_true, y_proba,
64
- multi_class='ovr',
65
- average='macro'
66
- )
67
-
68
- # Log loss
69
- metrics['log_loss'] = log_loss(y_true, y_proba)
70
-
71
- except Exception as e:
72
- logger.warning(f"ROC-AUC calculation failed: {e}")
73
- metrics['roc_auc'] = np.nan
74
- metrics['log_loss'] = np.nan
75
-
76
- return metrics
77
-
78
-
79
- def calculate_regression_metrics(
80
- y_true: np.ndarray,
81
- y_pred: np.ndarray
82
- ) -> Dict[str, float]:
83
- """
84
- Calculate all regression metrics.
85
-
86
- Parameters
87
- ----------
88
- y_true : np.ndarray
89
- True values
90
- y_pred : np.ndarray
91
- Predicted values
92
-
93
- Returns
94
- -------
95
- metrics : dict
96
- Dictionary of metric names and values
97
- """
98
- metrics = {
99
- 'r2': r2_score(y_true, y_pred),
100
- 'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
101
- 'mae': mean_absolute_error(y_true, y_pred),
102
- 'mse': mean_squared_error(y_true, y_pred)
103
- }
104
-
105
- # MAPE (avoid division by zero)
106
- try:
107
- non_zero_mask = y_true != 0
108
- if np.any(non_zero_mask):
109
- mape = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
110
- metrics['mape'] = mape
111
- else:
112
- metrics['mape'] = np.nan
113
- except:
114
- metrics['mape'] = np.nan
115
-
116
- return metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/evaluation/statistical_tests.py DELETED
@@ -1,109 +0,0 @@
1
- """
2
- Statistical Tests
3
- =================
4
-
5
- Statistical significance testing for model comparisons.
6
-
7
- Implements:
8
- - Friedman test (non-parametric ANOVA)
9
- - Nemenyi post-hoc test
10
- - Critical difference calculation
11
-
12
- Author: UW MSIM Team
13
- Date: November 2025
14
- """
15
-
16
- import numpy as np
17
- import pandas as pd
18
- from scipy import stats
19
- from typing import Dict, Tuple
20
- import logging
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
- def friedman_test(results_df: pd.DataFrame) -> Dict:
26
- """
27
- Friedman test for comparing multiple models.
28
-
29
- Parameters
30
- ----------
31
- results_df : pd.DataFrame
32
- Rows = datasets, columns = models, values = metric scores
33
-
34
- Returns
35
- -------
36
- results : dict
37
- Test statistic, p-value, and significance
38
- """
39
- # Rank models for each dataset (higher is better)
40
- ranks = results_df.rank(axis=1, ascending=False)
41
-
42
- # Friedman test
43
- stat, p_value = stats.friedmanchisquare(*[ranks[col] for col in ranks.columns])
44
-
45
- logger.info(f"Friedman Test: statistic={stat:.4f}, p-value={p_value:.4e}")
46
-
47
- return {
48
- 'statistic': stat,
49
- 'p_value': p_value,
50
- 'significant': p_value < 0.05,
51
- 'avg_ranks': ranks.mean().to_dict()
52
- }
53
-
54
-
55
- def nemenyi_post_hoc(results_df: pd.DataFrame) -> pd.DataFrame:
56
- """
57
- Nemenyi post-hoc test (pairwise comparisons).
58
-
59
- Parameters
60
- ----------
61
- results_df : pd.DataFrame
62
- Rows = datasets, columns = models, values = metric scores
63
-
64
- Returns
65
- -------
66
- p_values : pd.DataFrame
67
- Pairwise p-values
68
- """
69
- try:
70
- import scikit_posthocs as sp
71
- ranks = results_df.rank(axis=1, ascending=False)
72
- p_values = sp.posthoc_nemenyi_friedman(ranks.T)
73
- return p_values
74
- except ImportError:
75
- logger.error("scikit-posthocs not installed. Install with: pip install scikit-posthocs")
76
- raise
77
-
78
-
79
- def critical_difference(
80
- n_datasets: int,
81
- n_models: int,
82
- alpha: float = 0.05
83
- ) -> float:
84
- """
85
- Calculate critical difference for CD diagrams.
86
-
87
- Parameters
88
- ----------
89
- n_datasets : int
90
- Number of datasets
91
- n_models : int
92
- Number of models
93
- alpha : float
94
- Significance level
95
-
96
- Returns
97
- -------
98
- cd : float
99
- Critical difference value
100
- """
101
- # Critical value from Nemenyi distribution
102
- # Approximation using normal distribution
103
- q_alpha = stats.norm.ppf(1 - alpha / 2)
104
-
105
- cd = q_alpha * np.sqrt((n_models * (n_models + 1)) / (6 * n_datasets))
106
-
107
- logger.info(f"Critical Difference: {cd:.4f} (alpha={alpha})")
108
-
109
- return cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/runners/__init__.py DELETED
@@ -1,11 +0,0 @@
1
- """
2
- Experiment Runners Package
3
- ===========================
4
-
5
- Tools for executing benchmarking experiments.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- __all__ = ['run_experiment', 'run_batch']
 
 
 
 
 
 
 
 
 
 
 
 
code/runners/run_baselines.py DELETED
@@ -1,50 +0,0 @@
1
- """
2
- Baseline Models Batch Runner
3
- ==============================
4
-
5
- Run all baseline models (XGBoost, CatBoost, LightGBM) on all or specific datasets.
6
-
7
- Usage:
8
- # Run on ALL datasets
9
- py -3.12 -m runners.run_baselines
10
-
11
- # Run on specific datasets
12
- py -3.12 -m runners.run_baselines --dataset analcatdata_authorship diabetes
13
-
14
- Author: UW MSIM Team
15
- Date: April 2026
16
- """
17
-
18
- import argparse
19
- import sys
20
- from pathlib import Path
21
-
22
- # Add parent directory to path
23
- sys.path.insert(0, str(Path(__file__).parent.parent))
24
-
25
- from runners.run_batch import main as run_batch_main
26
-
27
-
28
- BASELINE_MODELS = ['xgboost', 'catboost', 'lightgbm']
29
-
30
-
31
- def main():
32
- """Run all baseline models on all or specific datasets."""
33
- parser = argparse.ArgumentParser(description='Run baseline models')
34
- parser.add_argument('--dataset', nargs='*', default=None,
35
- help='Specific dataset(s) to run (e.g., --dataset analcatdata_authorship diabetes)')
36
-
37
- args = parser.parse_args()
38
-
39
- # Build sys.argv for run_batch
40
- batch_args = ['run_baselines', '--model-filter', *BASELINE_MODELS]
41
-
42
- if args.dataset:
43
- batch_args.extend(['--dataset-filter', *args.dataset])
44
-
45
- sys.argv = batch_args
46
- run_batch_main()
47
-
48
-
49
- if __name__ == '__main__':
50
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/runners/run_batch.py DELETED
@@ -1,289 +0,0 @@
1
- """
2
- Batch Experiment Runner
3
- ========================
4
-
5
- Run multiple models on multiple datasets.
6
-
7
- Usage:
8
- python -m runners.run_batch \
9
- --datasets config/datasets.yaml \
10
- --models config/models.yaml
11
-
12
- Author: UW MSIM Team
13
- Date: April 2026
14
- """
15
-
16
- import argparse
17
- import yaml
18
- import logging
19
- import sys
20
- import os
21
- import json
22
- import time
23
- from pathlib import Path
24
- from typing import List, Dict, Optional
25
-
26
- # Add parent directory to path
27
- sys.path.insert(0, str(Path(__file__).parent.parent))
28
-
29
- from runners.run_experiment import run_single_experiment, get_model
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
-
34
- def get_dataset_list(datasets_config: dict, dataset_dir: str = None) -> List[str]:
35
- """
36
- Get list of available dataset names from the download directory.
37
-
38
- Parameters
39
- ----------
40
- datasets_config : dict
41
- Datasets YAML configuration
42
- dataset_dir : str
43
- Directory containing downloaded datasets
44
-
45
- Returns
46
- -------
47
- datasets : list of str
48
- List of dataset names
49
- """
50
- datasets = []
51
-
52
- if dataset_dir is None:
53
- dataset_dir = str(Path(__file__).parent.parent.parent / 'datasets')
54
-
55
- if os.path.isdir(dataset_dir):
56
- # Find all *_X.csv files and extract dataset names
57
- for f in sorted(os.listdir(dataset_dir)):
58
- if f.endswith('_X.csv'):
59
- name = f[:-6] # Remove '_X.csv'
60
- # Verify y file also exists
61
- y_file = os.path.join(dataset_dir, f"{name}_y.csv")
62
- if os.path.exists(y_file):
63
- datasets.append(name)
64
-
65
- logger.info(f"Found {len(datasets)} datasets in {dataset_dir}")
66
- else:
67
- logger.warning(f"Dataset directory not found: {dataset_dir}")
68
-
69
- return datasets
70
-
71
-
72
- def get_model_list(models_config: dict) -> List[str]:
73
- """
74
- Get list of enabled model names from configuration.
75
-
76
- Parameters
77
- ----------
78
- models_config : dict
79
- Models YAML configuration
80
-
81
- Returns
82
- -------
83
- models : list of str
84
- List of enabled model names
85
- """
86
- models = []
87
-
88
- for model_entry in models_config.get('models', []):
89
- if model_entry.get('enabled', True):
90
- models.append(model_entry['name'])
91
-
92
- return models
93
-
94
-
95
- def run_batch_experiments(
96
- datasets: List[str],
97
- models: List[str],
98
- experiment_config: dict,
99
- output_dir: str = '../results/raw',
100
- skip_existing: bool = True
101
- ) -> dict:
102
- """
103
- Run experiments for all dataset × model combinations.
104
-
105
- Parameters
106
- ----------
107
- datasets : list of str
108
- Dataset names
109
- models : list of str
110
- Model names
111
- experiment_config : dict
112
- Experiment configuration (n_folds, random_state, etc.)
113
- output_dir : str
114
- Where to save results
115
- skip_existing : bool
116
- If True, skip experiments that already have result files
117
-
118
- Returns
119
- -------
120
- summary : dict
121
- Batch run summary with successes and failures
122
- """
123
- total_experiments = len(datasets) * len(models)
124
- logger.info(f"\n{'='*60}")
125
- logger.info(f"BATCH RUN: {len(datasets)} datasets × {len(models)} models = {total_experiments} experiments")
126
- logger.info(f"{'='*60}\n")
127
-
128
- summary = {
129
- 'total': total_experiments,
130
- 'completed': 0,
131
- 'skipped': 0,
132
- 'failed': 0,
133
- 'results': [],
134
- 'errors': []
135
- }
136
-
137
- batch_start_time = time.time()
138
-
139
- for i, dataset_name in enumerate(datasets):
140
- for j, model_name in enumerate(models):
141
- experiment_num = i * len(models) + j + 1
142
- output_file = os.path.join(output_dir, f"{dataset_name}_{model_name}.json")
143
-
144
- # Skip existing results
145
- if skip_existing and os.path.exists(output_file):
146
- logger.info(
147
- f"[{experiment_num}/{total_experiments}] "
148
- f"SKIP {model_name} on {dataset_name} (result exists)"
149
- )
150
- summary['skipped'] += 1
151
- continue
152
-
153
- logger.info(
154
- f"\n[{experiment_num}/{total_experiments}] "
155
- f"Running {model_name} on {dataset_name}..."
156
- )
157
-
158
- try:
159
- result = run_single_experiment(
160
- dataset_name=dataset_name,
161
- model_name=model_name,
162
- config=experiment_config,
163
- output_dir=output_dir
164
- )
165
- summary['completed'] += 1
166
- summary['results'].append({
167
- 'dataset': dataset_name,
168
- 'model': model_name,
169
- 'status': 'success'
170
- })
171
-
172
- except Exception as e:
173
- logger.error(f"FAILED: {model_name} on {dataset_name}: {e}")
174
- summary['failed'] += 1
175
- summary['errors'].append({
176
- 'dataset': dataset_name,
177
- 'model': model_name,
178
- 'error': str(e)
179
- })
180
-
181
- batch_elapsed = time.time() - batch_start_time
182
-
183
- # Print summary
184
- logger.info(f"\n{'='*60}")
185
- logger.info(f"BATCH RUN COMPLETE")
186
- logger.info(f"{'='*60}")
187
- logger.info(f" Total experiments: {summary['total']}")
188
- logger.info(f" Completed: {summary['completed']}")
189
- logger.info(f" Skipped: {summary['skipped']}")
190
- logger.info(f" Failed: {summary['failed']}")
191
- logger.info(f" Total time: {batch_elapsed / 3600:.2f} hours")
192
- logger.info(f"{'='*60}\n")
193
-
194
- # Save batch summary
195
- os.makedirs(output_dir, exist_ok=True)
196
- summary_file = os.path.join(output_dir, '_batch_summary.json')
197
- summary['elapsed_hours'] = batch_elapsed / 3600
198
- with open(summary_file, 'w') as f:
199
- json.dump(summary, f, indent=2)
200
- logger.info(f"Batch summary saved to {summary_file}")
201
-
202
- return summary
203
-
204
-
205
- def main():
206
- """Entry point for batch runner."""
207
- # Setup logging
208
- logging.basicConfig(
209
- level=logging.INFO,
210
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
211
- )
212
-
213
- # Parse arguments
214
- parser = argparse.ArgumentParser(description='Run batch benchmarking experiments')
215
- parser.add_argument('--datasets', default='config/datasets.yaml',
216
- help='Datasets config file')
217
- parser.add_argument('--models', default='config/models.yaml',
218
- help='Models config file')
219
- parser.add_argument('--config', default='config/experiments.yaml',
220
- help='Experiment config file')
221
- parser.add_argument('--output-dir', default='../results/raw',
222
- help='Output directory')
223
- parser.add_argument('--dataset-dir', default=None,
224
- help='Directory containing downloaded datasets')
225
- parser.add_argument('--no-skip', action='store_true',
226
- help='Re-run experiments even if results exist')
227
- parser.add_argument('--model-filter', nargs='*', default=None,
228
- help='Only run specific models (e.g., --model-filter sap-rpt1-hf xgboost)')
229
- parser.add_argument('--dataset-filter', nargs='*', default=None,
230
- help='Only run specific datasets')
231
-
232
- args = parser.parse_args()
233
-
234
- # Load configs
235
- if os.path.exists(args.datasets):
236
- with open(args.datasets) as f:
237
- datasets_config = yaml.safe_load(f)
238
- else:
239
- datasets_config = {}
240
-
241
- if os.path.exists(args.models):
242
- with open(args.models) as f:
243
- models_config = yaml.safe_load(f)
244
- else:
245
- models_config = {}
246
-
247
- if os.path.exists(args.config):
248
- with open(args.config) as f:
249
- experiment_config = yaml.safe_load(f)
250
- else:
251
- experiment_config = {
252
- 'n_folds': 10,
253
- 'random_state': 42,
254
- 'cost_per_hour': 0.90,
255
- 'gpu_type': 'H200'
256
- }
257
-
258
- # Get dataset and model lists
259
- dataset_list = args.dataset_filter or get_dataset_list(datasets_config, args.dataset_dir)
260
- model_list = args.model_filter or get_model_list(models_config)
261
-
262
- if not dataset_list:
263
- print("[ERROR] No datasets found in the datasets directory.")
264
- sys.exit(1)
265
-
266
- if not model_list:
267
- print("[ERROR] No models enabled in config. Check config/models.yaml")
268
- sys.exit(1)
269
-
270
- print(f"\n[INFO] Datasets ({len(dataset_list)}): {dataset_list[:5]}{'...' if len(dataset_list) > 5 else ''}")
271
- print(f"[INFO] Models ({len(model_list)}): {model_list}")
272
-
273
- # Add dataset_dir to config for run_experiment to use
274
- experiment_config['dataset_dir'] = args.dataset_dir if args.dataset_dir else str(Path(__file__).parent.parent.parent / 'datasets')
275
-
276
- # Run batch
277
- summary = run_batch_experiments(
278
- datasets=dataset_list,
279
- models=model_list,
280
- experiment_config=experiment_config,
281
- output_dir=args.output_dir,
282
- skip_existing=not args.no_skip
283
- )
284
-
285
- print(f"\n[SUCCESS] Batch complete! {summary['completed']} succeeded, {summary['failed']} failed")
286
-
287
-
288
- if __name__ == "__main__":
289
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/runners/run_experiment.py DELETED
@@ -1,260 +0,0 @@
1
- """
2
- Single Experiment Runner
3
- =========================
4
-
5
- Run a single model on a single dataset.
6
-
7
- Usage:
8
- python -m runners.run_experiment --dataset adult --model sap-rpt1
9
-
10
- Author: UW MSIM Team
11
- Date: November 2025
12
- """
13
-
14
- import argparse
15
- import json
16
- import yaml
17
- import logging
18
- import sys
19
- import os
20
- from pathlib import Path
21
-
22
- # Add parent directory to path
23
- sys.path.insert(0, str(Path(__file__).parent.parent))
24
-
25
- from models import *
26
- from datasets.preprocessors import load_dataset
27
- from datasets.dataset_catalog import DatasetCatalog
28
- from evaluation import run_cross_validation, ComputeTracker
29
-
30
- logger = logging.getLogger(__name__)
31
-
32
-
33
- def get_model(model_name: str, task_type: str, config: dict):
34
- """
35
- Initialize model by name.
36
-
37
- Parameters
38
- ----------
39
- model_name : str
40
- Model identifier
41
- task_type : str
42
- 'classification' or 'regression'
43
- config : dict
44
- Model configuration
45
-
46
- Returns
47
- -------
48
- model : BaseModelWrapper
49
- Initialized model
50
- """
51
- model_map = {
52
- 'sap-rpt1': SAPRPT1Wrapper,
53
- 'sap-rpt1-small': lambda **kwargs: SAPRPT1Wrapper(model_size='small', **kwargs),
54
- 'sap-rpt1-large': lambda **kwargs: SAPRPT1Wrapper(model_size='large', **kwargs),
55
- 'sap-rpt1-hf': SAPRPT1HFWrapper,
56
- 'tabpfn': TabPFNWrapper,
57
- 'tabicl': TabICLWrapper,
58
- 'autogluon': AutoGluonWrapper,
59
- 'xgboost': XGBoostWrapper,
60
- 'catboost': CatBoostWrapper,
61
- 'lightgbm': LightGBMWrapper
62
- }
63
-
64
- if model_name not in model_map:
65
- raise ValueError(f"Unknown model: {model_name}. Choose from {list(model_map.keys())}")
66
-
67
- model_class = model_map[model_name]
68
-
69
- # Get specific parameters for this model
70
- model_config_key = model_name.replace('-', '_')
71
- # Special handling for size variants like sap-rpt1-small -> sap_rpt1
72
- if model_name.startswith('sap-rpt1-') and model_name not in ['sap-rpt1-hf']:
73
- model_config_key = 'sap_rpt1'
74
-
75
- model_params = config.get('model_params', {}).get(model_config_key, {})
76
-
77
- model = model_class(task_type=task_type, **model_params)
78
-
79
- logger.info(f"Initialized {model_name} for {task_type}")
80
-
81
- return model
82
-
83
-
84
- def run_single_experiment(
85
- dataset_name: str,
86
- model_name: str,
87
- config: dict,
88
- output_dir: str = '../results/raw'
89
- ) -> dict:
90
- """
91
- Run experiment on single dataset with single model.
92
-
93
- Parameters
94
- ----------
95
- dataset_name : str
96
- Dataset name
97
- model_name : str
98
- Model name
99
- config : dict
100
- Experiment configuration
101
- output_dir : str
102
- Where to save results
103
-
104
- Returns
105
- -------
106
- summary : dict
107
- Experiment results
108
- """
109
- logger.info(f"\n{'='*60}")
110
- logger.info(f"Experiment: {model_name} on {dataset_name}")
111
- logger.info(f"{'='*60}\n")
112
-
113
- # Create output directory
114
- os.makedirs(output_dir, exist_ok=True)
115
-
116
- # Start compute tracking
117
- tracker = ComputeTracker(
118
- cost_per_hour=config.get('cost_per_hour', 0.90),
119
- gpu_type=config.get('gpu_type', 'H200')
120
- )
121
- tracker.start()
122
-
123
- try:
124
- # Load dataset
125
- logger.info("Loading dataset...")
126
- default_dataset_dir = str(Path(__file__).parent.parent.parent / 'datasets')
127
- dataset_dir = config.get('dataset_dir', default_dataset_dir)
128
- dataset_path = config.get('dataset_path', None)
129
-
130
- if dataset_path and os.path.exists(dataset_path):
131
- # Explicit path provided
132
- X, y, task_type = load_dataset(dataset_path)
133
- elif os.path.isdir(dataset_dir):
134
- # Search for dataset files in the download directory
135
- X_file = None
136
- y_file = None
137
- for f in os.listdir(dataset_dir):
138
- fname_lower = f.lower()
139
- dname_lower = dataset_name.lower()
140
- if fname_lower == f"{dname_lower}_x.csv" or (fname_lower.endswith('_x.csv') and dname_lower in fname_lower):
141
- X_file = os.path.join(dataset_dir, f)
142
- if fname_lower == f"{dname_lower}_y.csv" or (fname_lower.endswith('_y.csv') and dname_lower in fname_lower):
143
- y_file = os.path.join(dataset_dir, f)
144
-
145
- if X_file and y_file:
146
- import pandas as pd_load
147
- X = pd_load.read_csv(X_file)
148
- y = pd_load.read_csv(y_file).iloc[:, 0]
149
- # Determine task type
150
- if y.dtype == 'object' or len(y.unique()) < 20:
151
- task_type = 'classification'
152
- else:
153
- task_type = 'regression'
154
- logger.info(f"Loaded {dataset_name}: {X.shape[0]} samples, {X.shape[1]} features, task={task_type}")
155
- else:
156
- # Fallback: try as a single CSV file
157
- csv_path = os.path.join(dataset_dir, f"{dataset_name}.csv")
158
- if os.path.exists(csv_path):
159
- X, y, task_type = load_dataset(csv_path)
160
- else:
161
- raise FileNotFoundError(
162
- f"Dataset '{dataset_name}' not found in {dataset_dir}.\n"
163
- f"Available files: {os.listdir(dataset_dir)[:10]}..."
164
- )
165
- else:
166
- raise FileNotFoundError(
167
- f"Dataset directory not found: {dataset_dir}"
168
- )
169
-
170
- # Initialize model
171
- model = get_model(model_name, task_type, config)
172
-
173
- # Run cross-validation
174
- fold_results = run_cross_validation(
175
- model=model,
176
- X=X,
177
- y=y,
178
- task_type=task_type,
179
- n_folds=config.get('n_folds', 10),
180
- random_state=config.get('random_state', 42)
181
- )
182
-
183
- # Stop tracking
184
- compute_summary = tracker.stop()
185
-
186
- # Aggregate results
187
- import pandas as pd
188
- results_df = pd.DataFrame(fold_results)
189
-
190
- summary = {
191
- 'dataset': dataset_name,
192
- 'model': model_name,
193
- 'task_type': task_type,
194
- 'n_samples': len(X),
195
- 'n_features': X.shape[1],
196
- 'n_folds': config.get('n_folds', 10),
197
- 'mean_metrics': results_df.mean().to_dict(),
198
- 'std_metrics': results_df.std().to_dict(),
199
- 'fold_results': fold_results,
200
- 'compute': compute_summary
201
- }
202
-
203
- # Save results
204
- output_file = os.path.join(output_dir, f"{dataset_name}_{model_name}.json")
205
- with open(output_file, 'w') as f:
206
- json.dump(summary, f, indent=2)
207
-
208
- logger.info(f"\n[SUCCESS] Results saved to {output_file}")
209
-
210
- # Print summary
211
- primary_metric = 'roc_auc' if task_type == 'classification' else 'r2'
212
- if primary_metric in summary['mean_metrics']:
213
- mean_val = summary['mean_metrics'][primary_metric]
214
- std_val = summary['std_metrics'][primary_metric]
215
- logger.info(f"\nPrimary Metric ({primary_metric}): {mean_val:.4f} ± {std_val:.4f}")
216
-
217
- return summary
218
-
219
- except Exception as e:
220
- logger.error(f"Experiment failed: {e}", exc_info=True)
221
- raise
222
-
223
-
224
- if __name__ == "__main__":
225
- # Setup logging
226
- logging.basicConfig(
227
- level=logging.INFO,
228
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
229
- )
230
-
231
- # Parse arguments
232
- parser = argparse.ArgumentParser(description='Run single benchmarking experiment')
233
- parser.add_argument('--dataset', required=True, help='Dataset name')
234
- parser.add_argument('--model', required=True, help='Model name')
235
- parser.add_argument('--config', default='../config/experiments.yaml', help='Config file')
236
- parser.add_argument('--output-dir', default='../results/raw', help='Output directory')
237
-
238
- args = parser.parse_args()
239
-
240
- # Load config
241
- if os.path.exists(args.config):
242
- with open(args.config) as f:
243
- config = yaml.safe_load(f)
244
- else:
245
- config = {
246
- 'n_folds': 10,
247
- 'random_state': 42,
248
- 'cost_per_hour': 0.90,
249
- 'gpu_type': 'H200'
250
- }
251
-
252
- # Run experiment
253
- results = run_single_experiment(
254
- dataset_name=args.dataset,
255
- model_name=args.model,
256
- config=config,
257
- output_dir=args.output_dir
258
- )
259
-
260
- print("\n[SUCCESS] Experiment complete!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/utils/__init__.py DELETED
@@ -1,11 +0,0 @@
1
- """
2
- Utilities Package
3
- =================
4
-
5
- Logging, result export, and helper functions.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- __all__ = []
 
 
 
 
 
 
 
 
 
 
 
 
code/utils/logging_utils.py DELETED
@@ -1,63 +0,0 @@
1
- """
2
- Logging Utilities
3
- =================
4
-
5
- Structured logging for experiments.
6
-
7
- Author: UW MSIM Team
8
- Date: November 2025
9
- """
10
-
11
- import logging
12
- import sys
13
- from pathlib import Path
14
-
15
-
16
- def setup_logger(
17
- name: str,
18
- log_file: str = None,
19
- level: int = logging.INFO,
20
- format_string: str = None
21
- ) -> logging.Logger:
22
- """
23
- Setup logger with file and console handlers.
24
-
25
- Parameters
26
- ----------
27
- name : str
28
- Logger name
29
- log_file : str, optional
30
- Log file path
31
- level : int
32
- Logging level
33
- format_string : str, optional
34
- Custom format string
35
-
36
- Returns
37
- -------
38
- logger : logging.Logger
39
- Configured logger
40
- """
41
- if format_string is None:
42
- format_string = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
43
-
44
- # Create logger
45
- logger = logging.getLogger(name)
46
- logger.setLevel(level)
47
- logger.handlers = [] # Clear existing handlers
48
-
49
- # Console handler
50
- console_handler = logging.StreamHandler(sys.stdout)
51
- console_handler.setLevel(level)
52
- console_handler.setFormatter(logging.Formatter(format_string))
53
- logger.addHandler(console_handler)
54
-
55
- # File handler (if specified)
56
- if log_file:
57
- Path(log_file).parent.mkdir(parents=True, exist_ok=True)
58
- file_handler = logging.FileHandler(log_file)
59
- file_handler.setLevel(level)
60
- file_handler.setFormatter(logging.Formatter(format_string))
61
- logger.addHandler(file_handler)
62
-
63
- return logger
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
webapp/benchmark.py CHANGED
@@ -24,8 +24,8 @@ os.environ.setdefault("TABPFN_ACCEPT_TERMS", "1")
24
  os.environ.setdefault("TABPFN_LICENSE_ACCEPTED", "1")
25
  os.environ.setdefault("AGREE_TABPFN_LICENSE", "1")
26
 
27
- # Allow importing model wrappers from the code directory
28
- sys.path.insert(0, str(Path(__file__).parent.parent / "code"))
29
 
30
  N_FOLDS = int(os.getenv("N_FOLDS", "3"))
31
  RAND = int(os.getenv("RANDOM_STATE", "42"))
@@ -62,7 +62,6 @@ def _cat(task):
62
  def _tabpfn(task):
63
  if task != "classification":
64
  raise ValueError("TabPFN only supports classification tasks")
65
- from models.tabpfn_wrapper import TabPFNWrapper
66
  # TabPFNWrapper uses a class-level _shared_classifier so weights are only
67
  # loaded once per process regardless of how many instances are created.
68
  return TabPFNWrapper(task_type=task, random_state=RAND)
 
24
  os.environ.setdefault("TABPFN_LICENSE_ACCEPTED", "1")
25
  os.environ.setdefault("AGREE_TABPFN_LICENSE", "1")
26
 
27
+ # Imports are handled via absolute package paths
28
+ from webapp.models.tabpfn_wrapper import TabPFNWrapper
29
 
30
  N_FOLDS = int(os.getenv("N_FOLDS", "3"))
31
  RAND = int(os.getenv("RANDOM_STATE", "42"))
 
62
  def _tabpfn(task):
63
  if task != "classification":
64
  raise ValueError("TabPFN only supports classification tasks")
 
65
  # TabPFNWrapper uses a class-level _shared_classifier so weights are only
66
  # loaded once per process regardless of how many instances are created.
67
  return TabPFNWrapper(task_type=task, random_state=RAND)
webapp/main.py CHANGED
@@ -1,31 +1,31 @@
1
  import sys
2
- from pathlib import Path
3
- # Add both root and webapp directory to sys.path to resolve all import issues
4
- BASE_DIR = Path(__file__).resolve().parent.parent
5
- sys.path.insert(0, str(BASE_DIR))
6
- sys.path.insert(0, str(BASE_DIR / "webapp"))
7
-
8
  import io, os
 
 
9
  from dotenv import load_dotenv
10
-
11
- # Load .env before anything else so HF_TOKEN is available to benchmark.py
12
- load_dotenv(BASE_DIR / "webapp" / ".env")
13
-
14
  import pandas as pd
15
  import numpy as np
16
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
17
- from fastapi.responses import JSONResponse
18
  from fastapi.staticfiles import StaticFiles
19
 
20
- # Now we can import benchmark reliably
21
- try:
22
- from benchmark import run_benchmark, infer_task
23
- except ImportError:
24
- from webapp.benchmark import run_benchmark, infer_task
25
 
26
- # ── Config ─────────────────────────────────────────────────────────────────────
27
- MAX_FILE_BYTES = int(os.getenv("MAX_FILE_SIZE_MB", "5")) * 1024 * 1024 # default 5 MB
 
28
 
 
 
 
 
 
 
 
 
 
 
29
  app = FastAPI(title="SAP RPT-1 Benchmarking API", version="1.0.0")
30
 
31
  # ── Static files (frontend) ────────────────────────────────────────────────────
@@ -184,10 +184,6 @@ async def benchmark(
184
 
185
  # Cache the Best Overall model for the Live Playground
186
  best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
187
- try:
188
- from benchmark import BUILDERS, _prep, _encode_target
189
- except ImportError:
190
- from webapp.benchmark import BUILDERS, _prep, _encode_target
191
  X = df.drop(columns=[target_col])
192
  y_raw = df[target_col]
193
  task = result["dataset_info"]["task"]
@@ -241,10 +237,6 @@ async def predict(data: dict):
241
  # Ensure column order matches training
242
  input_df = input_df[CHAMPION_INFO["features"]]
243
 
244
- try:
245
- from benchmark import _prep
246
- except ImportError:
247
- from webapp.benchmark import _prep
248
  # Use the EXACT same encoders that were used during training
249
  X_test, _ = _prep(input_df, encoders=CHAMPION_INFO.get("encoders"))
250
 
 
1
  import sys
 
 
 
 
 
 
2
  import io, os
3
+ import logging
4
+ from pathlib import Path
5
  from dotenv import load_dotenv
 
 
 
 
6
  import pandas as pd
7
  import numpy as np
8
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
9
+ from fastapi.responses import JSONResponse, FileResponse
10
  from fastapi.staticfiles import StaticFiles
11
 
12
+ # Absolute imports based on project root
13
+ from webapp.benchmark import run_benchmark, infer_task, BUILDERS, _prep, _encode_target
 
 
 
14
 
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
 
19
+ # Load .env
20
+ BASE_DIR = Path(__file__).resolve().parent.parent
21
+ load_dotenv(BASE_DIR / "webapp" / ".env")
22
+
23
+ # Verify Secrets on startup
24
+ logger.info(f"TABPFN_TOKEN status: {'SET' if os.environ.get('TABPFN_TOKEN') else 'MISSING'}")
25
+ logger.info(f"HF_TOKEN status: {'SET' if os.environ.get('HUGGING_FACE_HUB_TOKEN') else 'MISSING'}")
26
+
27
+ # ── Config ─────────────────────────────────────────────────────────────────────
28
+ MAX_FILE_BYTES = int(os.getenv("MAX_FILE_SIZE_MB", "5")) * 1024 * 1024
29
  app = FastAPI(title="SAP RPT-1 Benchmarking API", version="1.0.0")
30
 
31
  # ── Static files (frontend) ────────────────────────────────────────────────────
 
184
 
185
  # Cache the Best Overall model for the Live Playground
186
  best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
 
 
 
 
187
  X = df.drop(columns=[target_col])
188
  y_raw = df[target_col]
189
  task = result["dataset_info"]["task"]
 
237
  # Ensure column order matches training
238
  input_df = input_df[CHAMPION_INFO["features"]]
239
 
 
 
 
 
240
  # Use the EXACT same encoders that were used during training
241
  X_test, _ = _prep(input_df, encoders=CHAMPION_INFO.get("encoders"))
242
 
{code → webapp}/models/__init__.py RENAMED
File without changes
{code → webapp}/models/autogluon_wrapper.py RENAMED
File without changes
{code → webapp}/models/base_wrapper.py RENAMED
File without changes
{code → webapp}/models/baseline_wrappers.py RENAMED
File without changes
{code → webapp}/models/sap_rpt1_hf_wrapper.py RENAMED
File without changes
{code → webapp}/models/sap_rpt1_wrapper.py RENAMED
File without changes
{code → webapp}/models/tabicl_wrapper.py RENAMED
File without changes
{code → webapp}/models/tabpfn_wrapper.py RENAMED
File without changes