Spaces:

Akshay4506
/

ModelMatrix

Sleeping

App Files Community

Akshay4506 committed 21 days ago

Commit

f800c3b

1 Parent(s): 9423e26

feat: finalized stable architecture with consolidated structure and single-worker mode

Browse files

Files changed (28) hide show

Dockerfile +2 -2
code/analysis/__init__.py +0 -11
code/analysis/aggregate_results.py +0 -99
code/config/datasets.yaml +0 -33
code/config/experiments.yaml +0 -64
code/config/models.yaml +0 -84
code/docker/Dockerfile +0 -102
code/evaluation/__init__.py +0 -24
code/evaluation/compute_tracker.py +0 -114
code/evaluation/cross_validation.py +0 -127
code/evaluation/metrics.py +0 -116
code/evaluation/statistical_tests.py +0 -109
code/runners/__init__.py +0 -11
code/runners/run_baselines.py +0 -50
code/runners/run_batch.py +0 -289
code/runners/run_experiment.py +0 -260
code/utils/__init__.py +0 -11
code/utils/logging_utils.py +0 -63
webapp/benchmark.py +2 -3
webapp/main.py +18 -26
{code → webapp}/models/__init__.py +0 -0
{code → webapp}/models/autogluon_wrapper.py +0 -0
{code → webapp}/models/base_wrapper.py +0 -0
{code → webapp}/models/baseline_wrappers.py +0 -0
{code → webapp}/models/sap_rpt1_hf_wrapper.py +0 -0
{code → webapp}/models/sap_rpt1_wrapper.py +0 -0
{code → webapp}/models/tabicl_wrapper.py +0 -0
{code → webapp}/models/tabpfn_wrapper.py +0 -0

Dockerfile CHANGED Viewed

@@ -40,5 +40,5 @@ RUN pip install --no-cache-dir git+https://github.com/SAP-samples/sap-rpt-1-oss.
 # Expose port 7860 (Hugging Face Spaces default port)
 EXPOSE 7860
-# Run the FastAPI app
-CMD ["python", "-m", "uvicorn", "webapp.main:app", "--host", "0.0.0.0", "--port", "7860"]

 # Expose port 7860 (Hugging Face Spaces default port)
 EXPOSE 7860
+# Run the FastAPI app with a single worker to save RAM and avoid download race conditions
+CMD ["python", "-m", "uvicorn", "webapp.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

code/analysis/__init__.py DELETED Viewed

@@ -1,11 +0,0 @@
-"""
-Analysis Package
-================
-Results aggregation, statistical analysis, and visualization.
-Author: UW MSIM Team
-Date: November 2025
-"""
-__all__ = ['aggregate_results']

code/analysis/aggregate_results.py DELETED Viewed

@@ -1,99 +0,0 @@
-"""
-Results Aggregation
-===================
-Aggregate all experiment results into summary tables.
-Author: UW MSIM Team
-Date: November 2025
-"""
-import glob
-import json
-import pandas as pd
-import os
-import logging
-logger = logging.getLogger(__name__)
-def aggregate_all_results(
-    results_dir: str = '../results/raw',
-    output_file: str = '../results/processed/aggregated_results.csv'
-) -> pd.DataFrame:
-    """
-    Aggregate all experiment results into single DataFrame.
-    Parameters
-    ----------
-    results_dir : str
-        Directory containing result JSON files
-    output_file : str
-        Where to save aggregated CSV
-    Returns
-    -------
-    df : pd.DataFrame
-        Aggregated results
-    """
-    logger.info(f"Aggregating results from {results_dir}")
-    result_files = glob.glob(os.path.join(results_dir, '*.json'))
-    logger.info(f"Found {len(result_files)} result files")
-    aggregated = []
-    for file in result_files:
-        try:
-            with open(file) as f:
-                data = json.load(f)
-            record = {
-                'dataset': data['dataset'],
-                'model': data['model'],
-                'task_type': data['task_type'],
-                'n_samples': data['n_samples'],
-                'n_features': data['n_features'],
-                'n_folds': data['n_folds']
-            }
-            # Add mean metrics
-            for metric, value in data['mean_metrics'].items():
-                record[f'mean_{metric}'] = value
-            # Add std metrics
-            for metric, value in data['std_metrics'].items():
-                record[f'std_{metric}'] = value
-            # Add compute info
-            if 'compute' in data:
-                record['elapsed_hours'] = data['compute'].get('elapsed_hours')
-                record['cost_usd'] = data['compute'].get('cost_usd')
-            aggregated.append(record)
-        except Exception as e:
-            logger.warning(f"Failed to process {file}: {e}")
-    # Create DataFrame
-    df = pd.DataFrame(aggregated)
-    # Save
-    os.makedirs(os.path.dirname(output_file), exist_ok=True)
-    df.to_csv(output_file, index=False)
-    logger.info(f"Aggregated {len(df)} results to {output_file}")
-    return df
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    df = aggregate_all_results()
-    print(f"\n✅ Aggregated {len(df)} experiment results")
-    print(f"\nDatasets: {df['dataset'].nunique()}")
-    print(f"Models: {df['model'].nunique()}")
-    print(f"\nSample of results:")
-    print(df.head())

code/config/datasets.yaml DELETED Viewed

@@ -1,33 +0,0 @@
-# Dataset Configuration
-# =====================
-# Local Datasets (from datasets folder)
-local_datasets:
-  enabled: true
-  path: '../datasets'
-# TabZilla Datasets (subset of 20)
-tabzilla:
-  enabled: false  # Enable when data is available
-  path: '../datasets/tabzilla'
-# OpenML-CC18 (Classification subset)
-openml_cc18:
-  enabled: false
-  path: '../datasets/openml_cc18'
-# Dataset Filters
-filters:
-  min_samples: 100
-  max_samples: 100000
-  min_features: 2
-  max_features: 1000
-  task_types:
-    - classification
-    - regression
-# Preprocessing
-preprocessing:
-  handle_missing: 'mean'  # mean, median, most_frequent, drop
-  encode_categoricals: true
-  scale_features: false  # Most models handle scaling internally

code/config/experiments.yaml DELETED Viewed

@@ -1,64 +0,0 @@
-# Experiment Configuration
-# ========================
-# Cross-Validation Settings
-n_folds: 10
-random_state: 42
-timeout: 86400  # 24 hours per experiment
-# Compute Resources
-cost_per_hour: 0.90  # USD per GPU-hour (H200)
-gpu_type: 'H200'
-gpu_memory_limit: 80  # GB
-checkpoint_interval: 3600  # Save checkpoint every hour
-# Model-Specific Parameters
-model_params:
-  sap_rpt1:
-    context_size: 4096
-    bagging_factor: 4
-    model_size: 'small'  # or 'large'
-  sap_rpt1_hf:
-    max_context_size: 4096
-    bagging: 4
-  tabpfn:
-    n_ensemble: 1
-    device: 'auto'
-  autogluon:
-    time_limit: 300  # 5 minutes
-    preset: 'medium_quality'  # best_quality, high_quality, good_quality, medium_quality
-  xgboost:
-    n_estimators: 100
-    learning_rate: 0.1
-    max_depth: 6
-  catboost:
-    iterations: 100
-    learning_rate: 0.1
-    depth: 6
-  lightgbm:
-    n_estimators: 100
-    learning_rate: 0.1
-    max_depth: -1
-# Evaluation Metrics
-primary_metric:
-  classification: 'roc_auc'
-  regression: 'r2'
-# Statistical Testing
-statistical_tests:
-  friedman_alpha: 0.05
-  nemenyi_alpha: 0.05
-# Reproducibility
-reproducibility:
-  save_predictions: true
-  save_models: false  # Models can be large
-  log_hyperparameters: true
-  track_compute: true

code/config/models.yaml DELETED Viewed

@@ -1,84 +0,0 @@
-# Model Configuration
-# ====================
-models:
-  # SAP RPT-1 (Primary Model)
-  - name: 'sap-rpt1-small'
-    enabled: true
-    priority: 'high'
-    docker_image: 'sap-rpt1'
-  - name: 'sap-rpt1-large'
-    enabled: true
-    priority: 'high'
-    docker_image: 'sap-rpt1'
-  # SAP RPT-1 OSS via Hugging Face (Open Source)
-  - name: 'sap-rpt1-hf'
-    enabled: true
-    priority: 'high'
-    docker_image: 'sap-rpt1'
-    description: 'SAP RPT-1 OSS model via HuggingFace token authentication'
-  # Pretrained Competitors
-  - name: 'tabpfn'
-    enabled: true
-    priority: 'high'
-    docker_image: 'tabpfn'
-  - name: 'tabicl'
-    enabled: false  # Enable when implementation ready
-    priority: 'medium'
-    docker_image: 'tabicl'
-  # AutoML
-  - name: 'autogluon'
-    enabled: true
-    priority: 'medium'
-    docker_image: 'autogluon'
-  # Gradient Boosting Baselines
-  - name: 'xgboost'
-    enabled: true
-    priority: 'medium'
-    docker_image: 'baselines'
-  - name: 'catboost'
-    enabled: true
-    priority: 'medium'
-    docker_image: 'baselines'
-  - name: 'lightgbm'
-    enabled: true
-    priority: 'low'
-    docker_image: 'baselines'
-# Model Groups (for batch experiments)
-model_groups:
-  all:
-    - sap-rpt1-small
-    - sap-rpt1-large
-    - sap-rpt1-hf
-    - tabpfn
-    - autogluon
-    - xgboost
-    - catboost
-    - lightgbm
-  pretrained_only:
-    - sap-rpt1-small
-    - sap-rpt1-large
-    - sap-rpt1-hf
-    - tabpfn
-  baselines_only:
-    - xgboost
-    - catboost
-    - lightgbm
-  high_priority:
-    - sap-rpt1-small
-    - sap-rpt1-large
-    - sap-rpt1-hf
-    - tabpfn

code/docker/Dockerfile DELETED Viewed

@@ -1,102 +0,0 @@
-# =============================================================================
-# SAP RPT-1 Benchmarking - Multi-stage Dockerfile
-# =============================================================================
-# Builds two targets:
-#   - sap-rpt1: Python 3.11 with SAP RPT-1 OSS + all dependencies
-#   - baselines: Python 3.11 with XGBoost, CatBoost, LightGBM
-#
-# Usage:
-#   docker-compose build
-#   docker-compose run sap-rpt1
-#   docker-compose run baselines
-# =============================================================================
-# ---------- Base stage (shared by all targets) ----------
-FROM python:3.11-slim AS base
-# System dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    git \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-WORKDIR /app
-# Copy requirements first (for Docker layer caching)
-COPY requirements.txt /app/requirements.txt
-# ---------- SAP RPT-1 target ----------
-FROM base AS sap-rpt1
-# Install core scientific stack first (heavy packages)
-RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
-    numpy==1.26.4 \
-    pandas==2.2.3 \
-    scikit-learn==1.6.1 \
-    scipy==1.14.1 \
-    matplotlib==3.9.2 \
-    seaborn==0.13.2
-# Install Hugging Face and PyTorch stack
-RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
-    --extra-index-url https://download.pytorch.org/whl/cpu \
-    torch==2.7.0+cpu \
-    transformers==4.52.4 \
-    accelerate==1.6.0 \
-    huggingface-hub==0.30.2 \
-    datasets==3.5.0 \
-    pyarrow==20.0.0 \
-    torcheval==0.0.7
-# Install SAP RPT-1 and remaining requirements
-RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir -r requirements.txt
-# Copy project code
-COPY . /app
-# Set Python path
-ENV PYTHONPATH=/app/code
-WORKDIR /app/code
-# Set entrypoint so you can run via arguments natively
-ENTRYPOINT ["python"]
-CMD ["-m", "runners.run_experiment", "--dataset", "adult", "--model", "sap-rpt1-hf"]
-# ---------- Baselines target ----------
-FROM base AS baselines
-# Install core scientific stack (heavy packages)
-RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
-    numpy==1.26.4 \
-    pandas==2.2.3 \
-    scikit-learn==1.6.1 \
-    scipy==1.14.1
-# Install visualization and utilities
-RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
-    matplotlib==3.9.2 \
-    seaborn==0.13.2 \
-    pyyaml==6.0.2 \
-    tqdm==4.67.1 \
-    joblib==1.4.2 \
-    python-dotenv==1.0.1
-# Install ML frameworks and OpenML
-RUN pip install --default-timeout=1000 --retries 5 --no-cache-dir \
-    openml==0.14.2 \
-    xgboost \
-    catboost \
-    lightgbm
-# Copy project code
-COPY . /app
-# Set Python path
-ENV PYTHONPATH=/app/code
-WORKDIR /app/code
-# Set entrypoint so you can run via arguments natively
-ENTRYPOINT ["python"]
-CMD ["-m", "runners.run_batch", "--datasets", "config/datasets.yaml", "--models", "config/models.yaml"]

code/evaluation/__init__.py DELETED Viewed

@@ -1,24 +0,0 @@
-"""
-Evaluation Package
-==================
-Tools for model evaluation, statistical testing, and benchmarking.
-Author: UW MSIM Team
-Date: November 2025
-"""
-from .metrics import calculate_classification_metrics, calculate_regression_metrics
-from .cross_validation import run_cross_validation
-from .statistical_tests import friedman_test, nemenyi_post_hoc, critical_difference
-from .compute_tracker import ComputeTracker
-__all__ = [
-    'calculate_classification_metrics',
-    'calculate_regression_metrics',
-    'run_cross_validation',
-    'friedman_test',
-    'nemenyi_post_hoc',
-    'critical_difference',
-    'ComputeTracker'
-]

code/evaluation/compute_tracker.py DELETED Viewed

@@ -1,114 +0,0 @@
-"""
-Compute Resource Tracker
-=========================
-Track GPU hours, costs, and resource usage for experiments.
-Author: UW MSIM Team
-Date: November 2025
-"""
-import time
-import numpy as np
-from typing import Dict, Optional, List
-try:
-    import psutil
-    HAS_PSUTIL = True
-except ImportError:
-    HAS_PSUTIL = False
-import logging
-logger = logging.getLogger(__name__)
-class ComputeTracker:
-    """
-    Track compute resources and costs.
-    Parameters
-    ----------
-    cost_per_hour : float
-        Cost per GPU-hour in USD
-    gpu_type : str
-        GPU type (e.g., 'H200', 'A100', 'L40S')
-    """
-    def __init__(self, cost_per_hour: float = 0.90, gpu_type: str = 'H200'):
-        self.cost_per_hour = cost_per_hour
-        self.gpu_type = gpu_type
-        self.start_time: Optional[float] = None
-        self.end_time: Optional[float] = None
-        self.gpu_usage_log: List[Dict] = []
-    def start(self):
-        """Start tracking."""
-        self.start_time = time.time()
-        self.gpu_usage_log = []
-        logger.info(f"Compute tracking started (GPU: {self.gpu_type}, ${self.cost_per_hour}/hr)")
-    def log_gpu_usage(self):
-        """Log current GPU usage."""
-        try:
-            import GPUtil
-            gpus = GPUtil.getGPUs()
-            for gpu in gpus:
-                self.gpu_usage_log.append({
-                    'timestamp': time.time(),
-                    'gpu_id': gpu.id,
-                    'gpu_load': gpu.load * 100,
-                    'memory_used_mb': gpu.memoryUsed,
-                    'memory_total_mb': gpu.memoryTotal,
-                    'memory_util': (gpu.memoryUsed / gpu.memoryTotal) * 100,
-                    'temperature': getattr(gpu, 'temperature', None)
-                })
-        except ImportError:
-            logger.warning("GPUtil not installed, GPU tracking unavailable")
-        except Exception as e:
-            logger.warning(f"GPU logging failed: {e}")
-    def stop(self) -> Dict:
-        """
-        Stop tracking and calculate costs.
-        Returns
-        -------
-        summary : dict
-            Elapsed time, costs, and GPU usage summary
-        """
-        self.end_time = time.time()
-        elapsed_hours = (self.end_time - self.start_time) / 3600
-        total_cost = elapsed_hours * self.cost_per_hour
-        # CPU usage
-        if HAS_PSUTIL:
-            cpu_percent = psutil.cpu_percent(interval=1)
-            memory_info = psutil.virtual_memory()
-            memory_percent = memory_info.percent
-            memory_used_gb = memory_info.used / (1024 ** 3)
-        else:
-            cpu_percent = 0.0
-            memory_percent = 0.0
-            memory_used_gb = 0.0
-        summary = {
-            'elapsed_hours': elapsed_hours,
-            'cost_usd': total_cost,
-            'cost_per_hour': self.cost_per_hour,
-            'gpu_type': self.gpu_type,
-            'cpu_percent': cpu_percent,
-            'memory_percent': memory_percent,
-            'memory_used_gb': memory_used_gb,
-            'gpu_logs_count': len(self.gpu_usage_log)
-        }
-        # Average GPU utilization
-        if self.gpu_usage_log:
-            summary['avg_gpu_load'] = np.mean([log['gpu_load'] for log in self.gpu_usage_log])
-            summary['avg_gpu_memory_util'] = np.mean([log['memory_util'] for log in self.gpu_usage_log])
-        logger.info(f"Compute tracking stopped: {elapsed_hours:.2f} hours, ${total_cost:.2f}")
-        return summary

code/evaluation/cross_validation.py DELETED Viewed

@@ -1,127 +0,0 @@
-"""
-Cross-Validation
-================
-10-fold stratified cross-validation for model evaluation.
-Author: UW MSIM Team
-Date: November 2025
-"""
-import numpy as np
-import pandas as pd
-from sklearn.model_selection import StratifiedKFold, KFold
-from sklearn.preprocessing import LabelEncoder
-from typing import List, Dict
-import logging
-from .metrics import calculate_classification_metrics, calculate_regression_metrics
-logger = logging.getLogger(__name__)
-def _encode_categorical_columns(X_train, X_val):
-    """
-    Label-encode object/categorical columns. Fitted on X_train,
-    applied to both X_train and X_val. Unknown categories in X_val
-    are mapped to -1.
-    """
-    X_train = X_train.copy()
-    X_val = X_val.copy()
-    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
-    if len(cat_cols) == 0:
-        return X_train, X_val
-    logger.info(f"  Encoding {len(cat_cols)} categorical columns: {list(cat_cols[:5])}{'...' if len(cat_cols) > 5 else ''}")
-    for col in cat_cols:
-        le = LabelEncoder()
-        # Fit on combined unique values from train (+ handle unseen in val)
-        combined = pd.concat([X_train[col], X_val[col]], axis=0).astype(str)
-        le.fit(combined)
-        X_train[col] = le.transform(X_train[col].astype(str))
-        X_val[col] = le.transform(X_val[col].astype(str))
-    return X_train, X_val
-def run_cross_validation(
-    model,
-    X: pd.DataFrame,
-    y: pd.Series,
-    task_type: str = 'classification',
-    n_folds: int = 10,
-    random_state: int = 42
-) -> List[Dict]:
-    """
-    Run k-fold cross-validation.
-    Parameters
-    ----------
-    model : BaseModelWrapper
-        Model to evaluate (must have fit/predict methods)
-    X : pd.DataFrame
-        Features
-    y : pd.Series
-        Target
-    task_type : str
-        'classification' or 'regression'
-    n_folds : int
-        Number of folds
-    random_state : int
-        Random seed
-    Returns
-    -------
-    fold_results : list of dict
-        Results for each fold
-    """
-    logger.info(f"Running {n_folds}-fold CV for {model.__class__.__name__}")
-    # Choose CV splitter
-    if task_type == 'classification':
-        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
-    else:
-        cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
-    fold_results = []
-    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
-        logger.info(f"  Fold {fold_idx + 1}/{n_folds}")
-        # Split data
-        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
-        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
-        # Auto-encode categorical columns so tree models can handle them
-        X_train, X_val = _encode_categorical_columns(X_train, X_val)
-        # Fit model
-        model.fit(X_train, y_train)
-        # Predict
-        y_pred = model.predict(X_val)
-        y_proba = None
-        if task_type == 'classification':
-            try:
-                y_proba = model.predict_proba(X_val)
-            except:
-                pass
-        # Calculate metrics
-        if task_type == 'classification':
-            metrics = calculate_classification_metrics(y_val, y_pred, y_proba)
-        else:
-            metrics = calculate_regression_metrics(y_val, y_pred)
-        # Add timing info
-        metrics.update({
-            'fold': fold_idx,
-            'fit_time': model.fit_time,
-            'predict_time': model.predict_time
-        })
-        fold_results.append(metrics)
-    return fold_results

code/evaluation/metrics.py DELETED Viewed

@@ -1,116 +0,0 @@
-"""
-Evaluation Metrics
-==================
-Comprehensive metrics for classification and regression tasks.
-Author: UW MSIM Team
-Date: November 2025
-"""
-import numpy as np
-from sklearn.metrics import (
-    roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
-    r2_score, mean_squared_error, mean_absolute_error, log_loss
-)
-from typing import Dict, Optional
-import logging
-logger = logging.getLogger(__name__)
-def calculate_classification_metrics(
-    y_true: np.ndarray,
-    y_pred: np.ndarray,
-    y_proba: Optional[np.ndarray] = None
-) -> Dict[str, float]:
-    """
-    Calculate all classification metrics.
-    Parameters
-    ----------
-    y_true : np.ndarray
-        True labels
-    y_pred : np.ndarray
-        Predicted labels
-    y_proba : np.ndarray, optional
-        Predicted probabilities (n_samples, n_classes)
-    Returns
-    -------
-    metrics : dict
-        Dictionary of metric names and values
-    """
-    metrics = {
-        'accuracy': accuracy_score(y_true, y_pred),
-        'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
-        'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
-        'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
-        'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0)
-    }
-    # ROC-AUC (if probabilities available)
-    if y_proba is not None:
-        try:
-            n_classes = len(np.unique(y_true))
-            if n_classes == 2:
-                # Binary classification
-                metrics['roc_auc'] = roc_auc_score(y_true, y_proba[:, 1])
-            else:
-                # Multi-class classification
-                metrics['roc_auc'] = roc_auc_score(
-                    y_true, y_proba,
-                    multi_class='ovr',
-                    average='macro'
-                )
-            # Log loss
-            metrics['log_loss'] = log_loss(y_true, y_proba)
-        except Exception as e:
-            logger.warning(f"ROC-AUC calculation failed: {e}")
-            metrics['roc_auc'] = np.nan
-            metrics['log_loss'] = np.nan
-    return metrics
-def calculate_regression_metrics(
-    y_true: np.ndarray,
-    y_pred: np.ndarray
-) -> Dict[str, float]:
-    """
-    Calculate all regression metrics.
-    Parameters
-    ----------
-    y_true : np.ndarray
-        True values
-    y_pred : np.ndarray
-        Predicted values
-    Returns
-    -------
-    metrics : dict
-        Dictionary of metric names and values
-    """
-    metrics = {
-        'r2': r2_score(y_true, y_pred),
-        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
-        'mae': mean_absolute_error(y_true, y_pred),
-        'mse': mean_squared_error(y_true, y_pred)
-    }
-    # MAPE (avoid division by zero)
-    try:
-        non_zero_mask = y_true != 0
-        if np.any(non_zero_mask):
-            mape = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
-            metrics['mape'] = mape
-        else:
-            metrics['mape'] = np.nan
-    except:
-        metrics['mape'] = np.nan
-    return metrics

code/evaluation/statistical_tests.py DELETED Viewed

@@ -1,109 +0,0 @@
-"""
-Statistical Tests
-=================
-Statistical significance testing for model comparisons.
-Implements:
-- Friedman test (non-parametric ANOVA)
-- Nemenyi post-hoc test
-- Critical difference calculation
-Author: UW MSIM Team
-Date: November 2025
-"""
-import numpy as np
-import pandas as pd
-from scipy import stats
-from typing import Dict, Tuple
-import logging
-logger = logging.getLogger(__name__)
-def friedman_test(results_df: pd.DataFrame) -> Dict:
-    """
-    Friedman test for comparing multiple models.
-    Parameters
-    ----------
-    results_df : pd.DataFrame
-        Rows = datasets, columns = models, values = metric scores
-    Returns
-    -------
-    results : dict
-        Test statistic, p-value, and significance
-    """
-    # Rank models for each dataset (higher is better)
-    ranks = results_df.rank(axis=1, ascending=False)
-    # Friedman test
-    stat, p_value = stats.friedmanchisquare(*[ranks[col] for col in ranks.columns])
-    logger.info(f"Friedman Test: statistic={stat:.4f}, p-value={p_value:.4e}")
-    return {
-        'statistic': stat,
-        'p_value': p_value,
-        'significant': p_value < 0.05,
-        'avg_ranks': ranks.mean().to_dict()
-    }
-def nemenyi_post_hoc(results_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Nemenyi post-hoc test (pairwise comparisons).
-    Parameters
-    ----------
-    results_df : pd.DataFrame
-        Rows = datasets, columns = models, values = metric scores
-    Returns
-    -------
-    p_values : pd.DataFrame
-        Pairwise p-values
-    """
-    try:
-        import scikit_posthocs as sp
-        ranks = results_df.rank(axis=1, ascending=False)
-        p_values = sp.posthoc_nemenyi_friedman(ranks.T)
-        return p_values
-    except ImportError:
-        logger.error("scikit-posthocs not installed. Install with: pip install scikit-posthocs")
-        raise
-def critical_difference(
-    n_datasets: int,
-    n_models: int,
-    alpha: float = 0.05
-) -> float:
-    """
-    Calculate critical difference for CD diagrams.
-    Parameters
-    ----------
-    n_datasets : int
-        Number of datasets
-    n_models : int
-        Number of models
-    alpha : float
-        Significance level
-    Returns
-    -------
-    cd : float
-        Critical difference value
-    """
-    # Critical value from Nemenyi distribution
-    # Approximation using normal distribution
-    q_alpha = stats.norm.ppf(1 - alpha / 2)
-    cd = q_alpha * np.sqrt((n_models * (n_models + 1)) / (6 * n_datasets))
-    logger.info(f"Critical Difference: {cd:.4f} (alpha={alpha})")
-    return cd

code/runners/__init__.py DELETED Viewed

@@ -1,11 +0,0 @@
-"""
-Experiment Runners Package
-===========================
-Tools for executing benchmarking experiments.
-Author: UW MSIM Team
-Date: November 2025
-"""
-__all__ = ['run_experiment', 'run_batch']

code/runners/run_baselines.py DELETED Viewed

@@ -1,50 +0,0 @@
-"""
-Baseline Models Batch Runner
-==============================
-Run all baseline models (XGBoost, CatBoost, LightGBM) on all or specific datasets.
-Usage:
-    # Run on ALL datasets
-    py -3.12 -m runners.run_baselines
-    # Run on specific datasets
-    py -3.12 -m runners.run_baselines --dataset analcatdata_authorship diabetes
-Author: UW MSIM Team
-Date: April 2026
-"""
-import argparse
-import sys
-from pathlib import Path
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from runners.run_batch import main as run_batch_main
-BASELINE_MODELS = ['xgboost', 'catboost', 'lightgbm']
-def main():
-    """Run all baseline models on all or specific datasets."""
-    parser = argparse.ArgumentParser(description='Run baseline models')
-    parser.add_argument('--dataset', nargs='*', default=None,
-                        help='Specific dataset(s) to run (e.g., --dataset analcatdata_authorship diabetes)')
-    args = parser.parse_args()
-    # Build sys.argv for run_batch
-    batch_args = ['run_baselines', '--model-filter', *BASELINE_MODELS]
-    if args.dataset:
-        batch_args.extend(['--dataset-filter', *args.dataset])
-    sys.argv = batch_args
-    run_batch_main()
-if __name__ == '__main__':
-    main()

code/runners/run_batch.py DELETED Viewed

@@ -1,289 +0,0 @@
-"""
-Batch Experiment Runner
-========================
-Run multiple models on multiple datasets.
-Usage:
-    python -m runners.run_batch \
-        --datasets config/datasets.yaml \
-        --models config/models.yaml
-Author: UW MSIM Team
-Date: April 2026
-"""
-import argparse
-import yaml
-import logging
-import sys
-import os
-import json
-import time
-from pathlib import Path
-from typing import List, Dict, Optional
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from runners.run_experiment import run_single_experiment, get_model
-logger = logging.getLogger(__name__)
-def get_dataset_list(datasets_config: dict, dataset_dir: str = None) -> List[str]:
-    """
-    Get list of available dataset names from the download directory.
-    Parameters
-    ----------
-    datasets_config : dict
-        Datasets YAML configuration
-    dataset_dir : str
-        Directory containing downloaded datasets
-    Returns
-    -------
-    datasets : list of str
-        List of dataset names
-    """
-    datasets = []
-    if dataset_dir is None:
-        dataset_dir = str(Path(__file__).parent.parent.parent / 'datasets')
-    if os.path.isdir(dataset_dir):
-        # Find all *_X.csv files and extract dataset names
-        for f in sorted(os.listdir(dataset_dir)):
-            if f.endswith('_X.csv'):
-                name = f[:-6]  # Remove '_X.csv'
-                # Verify y file also exists
-                y_file = os.path.join(dataset_dir, f"{name}_y.csv")
-                if os.path.exists(y_file):
-                    datasets.append(name)
-        logger.info(f"Found {len(datasets)} datasets in {dataset_dir}")
-    else:
-        logger.warning(f"Dataset directory not found: {dataset_dir}")
-    return datasets
-def get_model_list(models_config: dict) -> List[str]:
-    """
-    Get list of enabled model names from configuration.
-    Parameters
-    ----------
-    models_config : dict
-        Models YAML configuration
-    Returns
-    -------
-    models : list of str
-        List of enabled model names
-    """
-    models = []
-    for model_entry in models_config.get('models', []):
-        if model_entry.get('enabled', True):
-            models.append(model_entry['name'])
-    return models
-def run_batch_experiments(
-    datasets: List[str],
-    models: List[str],
-    experiment_config: dict,
-    output_dir: str = '../results/raw',
-    skip_existing: bool = True
-) -> dict:
-    """
-    Run experiments for all dataset × model combinations.
-    Parameters
-    ----------
-    datasets : list of str
-        Dataset names
-    models : list of str
-        Model names
-    experiment_config : dict
-        Experiment configuration (n_folds, random_state, etc.)
-    output_dir : str
-        Where to save results
-    skip_existing : bool
-        If True, skip experiments that already have result files
-    Returns
-    -------
-    summary : dict
-        Batch run summary with successes and failures
-    """
-    total_experiments = len(datasets) * len(models)
-    logger.info(f"\n{'='*60}")
-    logger.info(f"BATCH RUN: {len(datasets)} datasets × {len(models)} models = {total_experiments} experiments")
-    logger.info(f"{'='*60}\n")
-    summary = {
-        'total': total_experiments,
-        'completed': 0,
-        'skipped': 0,
-        'failed': 0,
-        'results': [],
-        'errors': []
-    }
-    batch_start_time = time.time()
-    for i, dataset_name in enumerate(datasets):
-        for j, model_name in enumerate(models):
-            experiment_num = i * len(models) + j + 1
-            output_file = os.path.join(output_dir, f"{dataset_name}_{model_name}.json")
-            # Skip existing results
-            if skip_existing and os.path.exists(output_file):
-                logger.info(
-                    f"[{experiment_num}/{total_experiments}] "
-                    f"SKIP {model_name} on {dataset_name} (result exists)"
-                )
-                summary['skipped'] += 1
-                continue
-            logger.info(
-                f"\n[{experiment_num}/{total_experiments}] "
-                f"Running {model_name} on {dataset_name}..."
-            )
-            try:
-                result = run_single_experiment(
-                    dataset_name=dataset_name,
-                    model_name=model_name,
-                    config=experiment_config,
-                    output_dir=output_dir
-                )
-                summary['completed'] += 1
-                summary['results'].append({
-                    'dataset': dataset_name,
-                    'model': model_name,
-                    'status': 'success'
-                })
-            except Exception as e:
-                logger.error(f"FAILED: {model_name} on {dataset_name}: {e}")
-                summary['failed'] += 1
-                summary['errors'].append({
-                    'dataset': dataset_name,
-                    'model': model_name,
-                    'error': str(e)
-                })
-    batch_elapsed = time.time() - batch_start_time
-    # Print summary
-    logger.info(f"\n{'='*60}")
-    logger.info(f"BATCH RUN COMPLETE")
-    logger.info(f"{'='*60}")
-    logger.info(f"  Total experiments: {summary['total']}")
-    logger.info(f"  Completed: {summary['completed']}")
-    logger.info(f"  Skipped: {summary['skipped']}")
-    logger.info(f"  Failed: {summary['failed']}")
-    logger.info(f"  Total time: {batch_elapsed / 3600:.2f} hours")
-    logger.info(f"{'='*60}\n")
-    # Save batch summary
-    os.makedirs(output_dir, exist_ok=True)
-    summary_file = os.path.join(output_dir, '_batch_summary.json')
-    summary['elapsed_hours'] = batch_elapsed / 3600
-    with open(summary_file, 'w') as f:
-        json.dump(summary, f, indent=2)
-    logger.info(f"Batch summary saved to {summary_file}")
-    return summary
def _load_yaml_config(path, default=None):
    """
    Read a YAML config file, tolerating a missing or empty file.

    Parameters
    ----------
    path : str
        Location of the YAML file
    default : dict, optional
        Returned when the file is absent or parses to ``None``
        (``yaml.safe_load`` yields ``None`` for an empty document).
        An empty dict is used when omitted.

    Returns
    -------
    config : dict
        Parsed file content, or the fallback described above
    """
    fallback = {} if default is None else default
    if not os.path.exists(path):
        return fallback
    with open(path) as f:
        loaded = yaml.safe_load(f)
    return fallback if loaded is None else loaded


def main():
    """
    Entry point for batch runner.

    Parses the command line, loads the dataset / model / experiment YAML
    files (falling back to built-in defaults when a file is missing or
    empty), resolves the dataset and model lists, and runs every
    dataset x model combination through ``run_batch_experiments``.
    Exits with status 1 when no datasets or no models are available.
    """
    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Parse arguments
    parser = argparse.ArgumentParser(description='Run batch benchmarking experiments')
    parser.add_argument('--datasets', default='config/datasets.yaml',
                        help='Datasets config file')
    parser.add_argument('--models', default='config/models.yaml',
                        help='Models config file')
    parser.add_argument('--config', default='config/experiments.yaml',
                        help='Experiment config file')
    parser.add_argument('--output-dir', default='../results/raw',
                        help='Output directory')
    parser.add_argument('--dataset-dir', default=None,
                        help='Directory containing downloaded datasets')
    parser.add_argument('--no-skip', action='store_true',
                        help='Re-run experiments even if results exist')
    parser.add_argument('--model-filter', nargs='*', default=None,
                        help='Only run specific models (e.g., --model-filter sap-rpt1-hf xgboost)')
    parser.add_argument('--dataset-filter', nargs='*', default=None,
                        help='Only run specific datasets')
    args = parser.parse_args()

    # Load configs. An empty file used to produce None here, which crashed
    # later on item assignment; it is now treated like a missing file.
    datasets_config = _load_yaml_config(args.datasets)
    models_config = _load_yaml_config(args.models)
    experiment_config = _load_yaml_config(
        args.config,
        default={
            'n_folds': 10,
            'random_state': 42,
            'cost_per_hour': 0.90,
            'gpu_type': 'H200'
        }
    )

    # Get dataset and model lists (explicit CLI filters take precedence)
    dataset_list = args.dataset_filter or get_dataset_list(datasets_config, args.dataset_dir)
    model_list = args.model_filter or get_model_list(models_config)
    if not dataset_list:
        print("[ERROR] No datasets found in the datasets directory.")
        sys.exit(1)
    if not model_list:
        print("[ERROR] No models enabled in config. Check config/models.yaml")
        sys.exit(1)
    print(f"\n[INFO] Datasets ({len(dataset_list)}): {dataset_list[:5]}{'...' if len(dataset_list) > 5 else ''}")
    print(f"[INFO] Models ({len(model_list)}): {model_list}")

    # Add dataset_dir to config for run_experiment to use
    experiment_config['dataset_dir'] = (
        args.dataset_dir if args.dataset_dir
        else str(Path(__file__).parent.parent.parent / 'datasets')
    )

    # Run batch
    summary = run_batch_experiments(
        datasets=dataset_list,
        models=model_list,
        experiment_config=experiment_config,
        output_dir=args.output_dir,
        skip_existing=not args.no_skip
    )
    print(f"\n[SUCCESS] Batch complete! {summary['completed']} succeeded, {summary['failed']} failed")


if __name__ == "__main__":
    main()

code/runners/run_experiment.py DELETED Viewed

@@ -1,260 +0,0 @@
-"""
-Single Experiment Runner
-=========================
-Run a single model on a single dataset.
-Usage:
-    python -m runners.run_experiment --dataset adult --model sap-rpt1
-Author: UW MSIM Team
-Date: November 2025
-"""
-import argparse
-import json
-import yaml
-import logging
-import sys
-import os
-from pathlib import Path
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from models import *
-from datasets.preprocessors import load_dataset
-from datasets.dataset_catalog import DatasetCatalog
-from evaluation import run_cross_validation, ComputeTracker
-logger = logging.getLogger(__name__)
def get_model(model_name: str, task_type: str, config: dict):
    """
    Construct the wrapper registered for ``model_name``.

    Parameters
    ----------
    model_name : str
        Key of the model to build (one of the registry keys below)
    task_type : str
        'classification' or 'regression'; forwarded to the wrapper
    config : dict
        Experiment configuration; per-model keyword arguments are read from
        ``config['model_params'][<key>]`` when present

    Returns
    -------
    model : BaseModelWrapper
        Initialized model

    Raises
    ------
    ValueError
        If ``model_name`` is not a registry key
    """
    def _sized_sap_rpt1(size):
        # Factory for the SAP RPT-1 variants that pin ``model_size``.
        return lambda **kwargs: SAPRPT1Wrapper(model_size=size, **kwargs)

    registry = {
        'sap-rpt1': SAPRPT1Wrapper,
        'sap-rpt1-small': _sized_sap_rpt1('small'),
        'sap-rpt1-large': _sized_sap_rpt1('large'),
        'sap-rpt1-hf': SAPRPT1HFWrapper,
        'tabpfn': TabPFNWrapper,
        'tabicl': TabICLWrapper,
        'autogluon': AutoGluonWrapper,
        'xgboost': XGBoostWrapper,
        'catboost': CatBoostWrapper,
        'lightgbm': LightGBMWrapper
    }
    if model_name not in registry:
        raise ValueError(f"Unknown model: {model_name}. Choose from {list(registry.keys())}")
    factory = registry[model_name]

    # Size variants (sap-rpt1-small / -large) share the base 'sap_rpt1'
    # parameter section; every other model uses its own name with dashes
    # turned into underscores.
    if model_name.startswith('sap-rpt1-') and model_name != 'sap-rpt1-hf':
        params_key = 'sap_rpt1'
    else:
        params_key = model_name.replace('-', '_')

    all_model_params = config.get('model_params', {})
    model_kwargs = all_model_params.get(params_key, {})

    instance = factory(task_type=task_type, **model_kwargs)
    logger.info(f"Initialized {model_name} for {task_type}")
    return instance
def _find_split_files(dataset_dir: str, dataset_name: str):
    """
    Locate the feature / target CSV pair for ``dataset_name``.

    Matching is case-insensitive. An exact ``<name>_x.csv`` / ``<name>_y.csv``
    file always wins; otherwise the first file (in sorted order) that ends
    with the suffix and contains the dataset name is used.

    Returns
    -------
    (X_file, y_file) : tuple
        Full paths, each ``None`` when no candidate was found
    """
    wanted = dataset_name.lower()
    # Sorted so the loose fallback is deterministic across platforms.
    entries = sorted(os.listdir(dataset_dir))

    def pick(suffix):
        loose = None
        for entry in entries:
            lowered = entry.lower()
            if lowered == f"{wanted}{suffix}":
                return os.path.join(dataset_dir, entry)
            if loose is None and lowered.endswith(suffix) and wanted in lowered:
                loose = os.path.join(dataset_dir, entry)
        return loose

    return pick('_x.csv'), pick('_y.csv')


def run_single_experiment(
    dataset_name: str,
    model_name: str,
    config: dict,
    output_dir: str = '../results/raw'
) -> dict:
    """
    Run experiment on single dataset with single model.

    Loads the dataset (explicit ``dataset_path``, then an ``*_X.csv`` /
    ``*_y.csv`` pair in ``dataset_dir``, then ``<dataset_name>.csv``),
    cross-validates the model, and writes a JSON summary to
    ``<output_dir>/<dataset_name>_<model_name>.json``.

    Parameters
    ----------
    dataset_name : str
        Dataset name
    model_name : str
        Model name
    config : dict
        Experiment configuration (``n_folds``, ``random_state``,
        ``cost_per_hour``, ``gpu_type``, ``dataset_dir``, ``dataset_path``,
        ``model_params``); every key is optional
    output_dir : str
        Where to save results

    Returns
    -------
    summary : dict
        Experiment results

    Raises
    ------
    FileNotFoundError
        If the dataset directory or the dataset files cannot be found
    Exception
        Any other failure is logged with its traceback and re-raised
    """
    import pandas as pd

    logger.info(f"\n{'='*60}")
    logger.info(f"Experiment: {model_name} on {dataset_name}")
    logger.info(f"{'='*60}\n")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Start compute tracking
    tracker = ComputeTracker(
        cost_per_hour=config.get('cost_per_hour', 0.90),
        gpu_type=config.get('gpu_type', 'H200')
    )
    tracker.start()
    try:
        # Load dataset
        logger.info("Loading dataset...")
        default_dataset_dir = str(Path(__file__).parent.parent.parent / 'datasets')
        # `or` (not a .get default) so an explicit null in the config also
        # falls back instead of reaching os.path.isdir(None).
        dataset_dir = config.get('dataset_dir') or default_dataset_dir
        dataset_path = config.get('dataset_path', None)
        if dataset_path and os.path.exists(dataset_path):
            # Explicit path provided
            X, y, task_type = load_dataset(dataset_path)
        elif os.path.isdir(dataset_dir):
            # Search for dataset files in the download directory
            X_file, y_file = _find_split_files(dataset_dir, dataset_name)
            if X_file and y_file:
                X = pd.read_csv(X_file)
                y = pd.read_csv(y_file).iloc[:, 0]
                # Determine task type: string targets or few distinct values
                # are treated as classification
                if y.dtype == 'object' or len(y.unique()) < 20:
                    task_type = 'classification'
                else:
                    task_type = 'regression'
                logger.info(f"Loaded {dataset_name}: {X.shape[0]} samples, {X.shape[1]} features, task={task_type}")
            else:
                # Fallback: try as a single CSV file
                csv_path = os.path.join(dataset_dir, f"{dataset_name}.csv")
                if os.path.exists(csv_path):
                    X, y, task_type = load_dataset(csv_path)
                else:
                    raise FileNotFoundError(
                        f"Dataset '{dataset_name}' not found in {dataset_dir}.\n"
                        f"Available files: {os.listdir(dataset_dir)[:10]}..."
                    )
        else:
            raise FileNotFoundError(
                f"Dataset directory not found: {dataset_dir}"
            )

        # Initialize model
        model = get_model(model_name, task_type, config)

        # Run cross-validation
        fold_results = run_cross_validation(
            model=model,
            X=X,
            y=y,
            task_type=task_type,
            n_folds=config.get('n_folds', 10),
            random_state=config.get('random_state', 42)
        )

        # Stop tracking
        compute_summary = tracker.stop()

        # Aggregate results. numeric_only guards against non-numeric fold
        # fields, which make mean()/std() raise on pandas >= 2.0.
        # NOTE(review): the columns of fold_results are not visible here - confirm.
        results_df = pd.DataFrame(fold_results)
        summary = {
            'dataset': dataset_name,
            'model': model_name,
            'task_type': task_type,
            'n_samples': len(X),
            'n_features': X.shape[1],
            'n_folds': config.get('n_folds', 10),
            'mean_metrics': results_df.mean(numeric_only=True).to_dict(),
            'std_metrics': results_df.std(numeric_only=True).to_dict(),
            'fold_results': fold_results,
            'compute': compute_summary
        }

        # Save results
        output_file = os.path.join(output_dir, f"{dataset_name}_{model_name}.json")
        with open(output_file, 'w') as f:
            json.dump(summary, f, indent=2)
        logger.info(f"\n[SUCCESS] Results saved to {output_file}")

        # Print summary
        primary_metric = 'roc_auc' if task_type == 'classification' else 'r2'
        if primary_metric in summary['mean_metrics']:
            mean_val = summary['mean_metrics'][primary_metric]
            std_val = summary['std_metrics'][primary_metric]
            logger.info(f"\nPrimary Metric ({primary_metric}): {mean_val:.4f} ± {std_val:.4f}")
        return summary
    except Exception as e:
        logger.error(f"Experiment failed: {e}", exc_info=True)
        raise
if __name__ == "__main__":
    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Parse arguments
    parser = argparse.ArgumentParser(description='Run single benchmarking experiment')
    parser.add_argument('--dataset', required=True, help='Dataset name')
    parser.add_argument('--model', required=True, help='Model name')
    parser.add_argument('--config', default='../config/experiments.yaml', help='Config file')
    parser.add_argument('--output-dir', default='../results/raw', help='Output directory')
    args = parser.parse_args()

    # Load config. Built-in defaults apply when the file is missing, and also
    # when it is empty: yaml.safe_load returns None for an empty document,
    # which previously crashed on the first config.get() call.
    config = {
        'n_folds': 10,
        'random_state': 42,
        'cost_per_hour': 0.90,
        'gpu_type': 'H200'
    }
    if os.path.exists(args.config):
        with open(args.config) as f:
            loaded_config = yaml.safe_load(f)
        if loaded_config is not None:
            config = loaded_config

    # Run experiment (the summary is also written to disk by the callee)
    run_single_experiment(
        dataset_name=args.dataset,
        model_name=args.model,
        config=config,
        output_dir=args.output_dir
    )
    print("\n[SUCCESS] Experiment complete!")

code/utils/__init__.py DELETED Viewed

@@ -1,11 +0,0 @@
-"""
-Utilities Package
-=================
-Logging, result export, and helper functions.
-Author: UW MSIM Team
-Date: November 2025
-"""
-__all__ = []

code/utils/logging_utils.py DELETED Viewed

@@ -1,63 +0,0 @@
-"""
-Logging Utilities
-=================
-Structured logging for experiments.
-Author: UW MSIM Team
-Date: November 2025
-"""
-import logging
-import sys
-from pathlib import Path
def setup_logger(
    name: str,
    log_file: str = None,
    level: int = logging.INFO,
    format_string: str = None
) -> logging.Logger:
    """
    Setup logger with file and console handlers.

    Any handlers already attached to the named logger are detached and
    closed first, so calling this repeatedly for the same name neither
    duplicates output nor leaks open log files.

    Parameters
    ----------
    name : str
        Logger name
    log_file : str, optional
        Log file path; parent directories are created when missing
    level : int
        Logging level applied to the logger and to each handler
    format_string : str, optional
        Custom format string

    Returns
    -------
    logger : logging.Logger
        Configured logger
    """
    if format_string is None:
        format_string = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(format_string)

    # Create logger
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Clear existing handlers. Iterate over a copy because removeHandler
    # mutates logger.handlers; close() releases the underlying file, which
    # plain reassignment of the list would leave open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # File handler (if specified)
    if log_file:
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger

webapp/benchmark.py CHANGED Viewed

@@ -24,8 +24,8 @@ os.environ.setdefault("TABPFN_ACCEPT_TERMS",     "1")
 os.environ.setdefault("TABPFN_LICENSE_ACCEPTED", "1")
 os.environ.setdefault("AGREE_TABPFN_LICENSE",    "1")
-# Allow importing model wrappers from the code directory
-sys.path.insert(0, str(Path(__file__).parent.parent / "code"))
 N_FOLDS   = int(os.getenv("N_FOLDS",   "3"))
 RAND      = int(os.getenv("RANDOM_STATE", "42"))
@@ -62,7 +62,6 @@ def _cat(task):
 def _tabpfn(task):
     if task != "classification":
         raise ValueError("TabPFN only supports classification tasks")
-    from models.tabpfn_wrapper import TabPFNWrapper
     # TabPFNWrapper uses a class-level _shared_classifier so weights are only
     # loaded once per process regardless of how many instances are created.
     return TabPFNWrapper(task_type=task, random_state=RAND)

 os.environ.setdefault("TABPFN_LICENSE_ACCEPTED", "1")
 os.environ.setdefault("AGREE_TABPFN_LICENSE",    "1")
+# Imports are handled via absolute package paths
+from webapp.models.tabpfn_wrapper import TabPFNWrapper
 N_FOLDS   = int(os.getenv("N_FOLDS",   "3"))
 RAND      = int(os.getenv("RANDOM_STATE", "42"))
 def _tabpfn(task):
     if task != "classification":
         raise ValueError("TabPFN only supports classification tasks")
     # TabPFNWrapper uses a class-level _shared_classifier so weights are only
     # loaded once per process regardless of how many instances are created.
     return TabPFNWrapper(task_type=task, random_state=RAND)

webapp/main.py CHANGED Viewed

@@ -1,31 +1,31 @@
 import sys
-from pathlib import Path
-# Add both root and webapp directory to sys.path to resolve all import issues
-BASE_DIR = Path(__file__).resolve().parent.parent
-sys.path.insert(0, str(BASE_DIR))
-sys.path.insert(0, str(BASE_DIR / "webapp"))
 import io, os
 from dotenv import load_dotenv
-# Load .env before anything else so HF_TOKEN is available to benchmark.py
-load_dotenv(BASE_DIR / "webapp" / ".env")
 import pandas as pd
 import numpy as np
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
-from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
-# Now we can import benchmark reliably
-try:
-    from benchmark import run_benchmark, infer_task
-except ImportError:
-    from webapp.benchmark import run_benchmark, infer_task
-# ── Config ─────────────────────────────────────────────────────────────────────
-MAX_FILE_BYTES = int(os.getenv("MAX_FILE_SIZE_MB", "5")) * 1024 * 1024   # default 5 MB
 app = FastAPI(title="SAP RPT-1 Benchmarking API", version="1.0.0")
 # ── Static files (frontend) ────────────────────────────────────────────────────
@@ -184,10 +184,6 @@ async def benchmark(
         # Cache the Best Overall model for the Live Playground
         best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
-        try:
-            from benchmark import BUILDERS, _prep, _encode_target
-        except ImportError:
-            from webapp.benchmark import BUILDERS, _prep, _encode_target
         X = df.drop(columns=[target_col])
         y_raw = df[target_col]
         task = result["dataset_info"]["task"]
@@ -241,10 +237,6 @@ async def predict(data: dict):
         # Ensure column order matches training
         input_df = input_df[CHAMPION_INFO["features"]]
-        try:
-            from benchmark import _prep
-        except ImportError:
-            from webapp.benchmark import _prep
         # Use the EXACT same encoders that were used during training
         X_test, _ = _prep(input_df, encoders=CHAMPION_INFO.get("encoders"))

 import sys
 import io, os
+import logging
+from pathlib import Path
 from dotenv import load_dotenv
 import pandas as pd
 import numpy as np
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import JSONResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load .env BEFORE importing webapp.benchmark: that module reads N_FOLDS and
# RANDOM_STATE from the environment at import time, so values defined only
# in .env would be ignored if the import ran first.
BASE_DIR = Path(__file__).resolve().parent.parent
load_dotenv(BASE_DIR / "webapp" / ".env")

# Absolute imports based on project root (deliberately placed after load_dotenv)
from webapp.benchmark import run_benchmark, infer_task, BUILDERS, _prep, _encode_target  # noqa: E402

# Verify Secrets on startup (only presence is logged, never the values)
logger.info(f"TABPFN_TOKEN status: {'SET' if os.environ.get('TABPFN_TOKEN') else 'MISSING'}")
# NOTE(review): the label says HF_TOKEN but only HUGGING_FACE_HUB_TOKEN was
# checked; both names are accepted here - confirm which one the wrappers read.
_hf_token_present = bool(os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN'))
logger.info(f"HF_TOKEN status:     {'SET' if _hf_token_present else 'MISSING'}")

# ── Config ─────────────────────────────────────────────────────────────────────
# Upload size limit in bytes; MAX_FILE_SIZE_MB defaults to 5
MAX_FILE_BYTES = int(os.getenv("MAX_FILE_SIZE_MB", "5")) * 1024 * 1024
 app = FastAPI(title="SAP RPT-1 Benchmarking API", version="1.0.0")
 # ── Static files (frontend) ────────────────────────────────────────────────────
         # Cache the Best Overall model for the Live Playground
         best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
         X = df.drop(columns=[target_col])
         y_raw = df[target_col]
         task = result["dataset_info"]["task"]
         # Ensure column order matches training
         input_df = input_df[CHAMPION_INFO["features"]]
         # Use the EXACT same encoders that were used during training
         X_test, _ = _prep(input_df, encoders=CHAMPION_INFO.get("encoders"))

{code → webapp}/models/__init__.py RENAMED Viewed

File without changes

{code → webapp}/models/autogluon_wrapper.py RENAMED Viewed

File without changes

{code → webapp}/models/base_wrapper.py RENAMED Viewed

File without changes

{code → webapp}/models/baseline_wrappers.py RENAMED Viewed

File without changes

{code → webapp}/models/sap_rpt1_hf_wrapper.py RENAMED Viewed

File without changes

{code → webapp}/models/sap_rpt1_wrapper.py RENAMED Viewed

File without changes

{code → webapp}/models/tabicl_wrapper.py RENAMED Viewed

File without changes

{code → webapp}/models/tabpfn_wrapper.py RENAMED Viewed

File without changes