"""Example usage of the ODSE Sandbox Environment.

Run with: python -m core.example (from the ODSE root directory)

Demonstrates:
    1. Classification (Breast Cancer) - explore, train, evaluate, submit
    2. Regression (House Price) - full pipeline in fewer steps
"""
from __future__ import annotations
import sys
from pathlib import Path
# Ensure the ODSE package root is on sys.path when running as a script,
# so that the ``core`` imports below resolve.
_root = str(Path(__file__).resolve().parent.parent)
if _root not in sys.path:
    sys.path.insert(0, _root)
from core.env import ODSEnvironment
from core.models import RunCodeAction, SubmitAction
# ============================================================================
# Helper
# ============================================================================
def _print_step(
    label: str,
    result,
    *,
    stdout_limit: int = 300,
    stderr_limit: int = 200,
) -> None:
    """Print a compact, human-readable summary of one environment step.

    Args:
        label: Heading shown in square brackets above the summary.
        result: Step result exposing ``observation`` and ``reward``. The
            observation is read for ``execution_status``, ``stdout``,
            ``stderr`` and ``validation_score``.
        stdout_limit: Maximum number of stdout characters to display.
        stderr_limit: Maximum number of stderr characters to display.
    """
    obs = result.observation
    print(f" [{label}]")
    print(f" status : {obs.execution_status}")
    print(f" reward : {result.reward:+.3f}")
    if obs.stdout:
        # Cut long output so a single noisy step cannot flood the console.
        preview = obs.stdout[:stdout_limit].rstrip()
        print(f" stdout : {preview}")
    if obs.stderr:
        print(f" stderr : {obs.stderr[:stderr_limit].rstrip()}")
    # A score of 0.0 is still worth reporting, hence the explicit None check.
    if obs.validation_score is not None:
        print(f" val_score : {obs.validation_score:.4f}")
    print()
# ============================================================================
# Classification example
# ============================================================================
def run_classification_example() -> None:
    """Run a classification episode on the Breast Cancer dataset.

    The episode explores the data, trains a logistic regression, scores it
    on the validation split, predicts on the test split and submits.
    Each snippet is sent to the environment as a ``RunCodeAction``; later
    snippets rely on names defined by earlier ones (``model``,
    ``train_medians``), so this assumes the sandbox keeps variables alive
    between steps.
    """
    dataset, difficulty = "breast_cancer", "easy"

    print("=" * 64)
    print(" CLASSIFICATION EXAMPLE - Breast Cancer (easy)")
    print("=" * 64)

    env = ODSEnvironment(dataset=dataset, difficulty=difficulty, seed=42)
    obs = env.reset()
    di = obs.dataset_info
    print(f"\n Dataset : {dataset} ({difficulty})")
    print(f" Train shape : {di.train_shape}")
    print(f" Val shape : {di.val_shape}")
    print(f" Test shape : {di.test_shape}")
    print(f" Target : {di.target_column} ({di.problem_type})")
    print(f" Metric : {di.metric}")
    print(f" Max steps : {obs.max_steps}")
    print(f"\n Task:\n {obs.task_description}\n")

    # -- Step 1: Explore -----------------------------------------------------
    r = env.step(RunCodeAction(code="""\
print("== Training Data ==")
print(f"Shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")
print(f"\\nNull counts:\\n{train_df.isnull().sum()}")
print(f"\\nTarget distribution:\\n{train_df[target_column].value_counts()}")
"""))
    _print_step("Step 1 - Explore data", r)

    # -- Step 2: Train a model -----------------------------------------------
    # The training medians are stored so validation and test data are imputed
    # with the same statistics the model was fitted on.
    r = env.step(RunCodeAction(code="""\
from sklearn.linear_model import LogisticRegression
# Prepare training data (all numeric, no encoding needed)
X_train = train_df.drop(target_column, axis=1).copy()
y_train = train_df[target_column]
# Fill any missing values with training medians (reused for val/test)
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
# Train model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
print(f"Model trained: {X_train.shape[0]} samples, {X_train.shape[1]} features")
"""))
    _print_step("Step 2 - Train model", r)

    # -- Step 3: Evaluate on validation --------------------------------------
    r = env.step(RunCodeAction(code="""\
# Prepare validation features (same transforms as training)
X_val = val_features.copy()
X_val = X_val.fillna(train_medians)
val_preds = model.predict(X_val)
score = evaluate(val_preds)
print(f"Validation report: {score}")
"""))
    _print_step("Step 3 - Evaluate on validation", r)

    # -- Step 4: Predict on test and submit ----------------------------------
    r = env.step(RunCodeAction(code="""\
# Prepare test features (same transforms as training)
X_test = test_features.copy()
X_test = X_test.fillna(train_medians)
predictions = model.predict(X_test)
print(f"Test predictions shape: {predictions.shape}")
"""))
    _print_step("Step 4 - Predict on test set", r)

    r = env.step(SubmitAction())
    print(" [Submit]")
    print(f" final reward : {r.reward:.4f}")
    print(f" test_score : {r.info.get('test_score')}")
    print(f" test_report : {r.info.get('test_report')}")
    print(f" steps_taken : {r.info.get('steps_taken')}")
    print(f" done : {r.done}")
    print()
# ============================================================================
# Regression example
# ============================================================================
def run_regression_example() -> None:
    """Run a regression episode on the House Price dataset.

    The whole pipeline (encode, impute, train, validate, predict) is sent
    to the environment as a single ``RunCodeAction``, followed by a
    ``SubmitAction``.

    The embedded snippet fits its label encoders and median imputation on
    the training data only and reuses them for the validation and test
    features. Categories that never occurred in training are encoded as -1,
    because ``LabelEncoder.transform`` raises on unseen labels.
    """
    dataset, difficulty = "house_price", "easy"

    print("=" * 64)
    print(" REGRESSION EXAMPLE - House Price (easy)")
    print("=" * 64)

    env = ODSEnvironment(dataset=dataset, difficulty=difficulty, seed=42)
    obs = env.reset()
    di = obs.dataset_info
    print(f"\n Dataset : {dataset} ({difficulty})")
    print(f" Train shape : {di.train_shape}")
    print(f" Target : {di.target_column} ({di.problem_type})")
    print(f" Metric : {di.metric}")
    print(f" Target stats : {di.target_stats}")
    print()

    # -- Step 1: Full pipeline in one cell -----------------------------------
    # The snippet deliberately avoids helper functions and comprehensions so
    # it does not depend on how the sandbox scopes executed code.
    r = env.step(RunCodeAction(code="""\
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
# Prepare
X = train_df.drop(target_column, axis=1).copy()
y = train_df[target_column]
# Encode categoricals (e.g. neighborhood)
enc = {}
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    enc[col] = le
# Impute with training medians (reused for validation and test)
medians = X.median(numeric_only=True)
X = X.fillna(medians)
# Train
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)
# Apply the transforms fitted on the training data to validation and test
Xv = val_features.copy()
Xt = test_features.copy()
for frame in (Xv, Xt):
    for col in frame.select_dtypes(include=['object', 'category']).columns:
        if col in enc:
            classes = enc[col].classes_
            known = dict(zip(classes, range(len(classes))))
            # Categories unseen during training map to -1 instead of raising
            frame[col] = frame[col].astype(str).map(known).fillna(-1).astype(int)
        else:
            frame[col] = LabelEncoder().fit_transform(frame[col].astype(str))
Xv = Xv.fillna(medians)
Xt = Xt.fillna(medians)
# Validate
val_preds = model.predict(Xv)
print(f"Validation: {evaluate(val_preds)}")
# Test predictions
predictions = model.predict(Xt)
print(f"Test predictions: {predictions.shape}")
"""))
    _print_step("Step 1 - Full pipeline", r)

    # -- Submit --------------------------------------------------------------
    r = env.step(SubmitAction())
    print(" [Submit]")
    print(f" final reward : {r.reward:.4f}")
    print(f" test_report : {r.info.get('test_report')}")
    print()
# ============================================================================
# Main
# ============================================================================
def main() -> None:
    """Run the classification demo, print a separator, then the regression demo."""
    run_classification_example()
    print("\n" + "-" * 64 + "\n")
    run_regression_example()


if __name__ == "__main__":
    main()