"""
main.py - FastAPI backend for the SAP RPT-1 Benchmarking Web App.

Serves the static frontend (/ and /arena) and exposes three API endpoints:
/preview, /benchmark, and /predict (the Live Playground).
"""

import io
import os
from pathlib import Path
from dotenv import load_dotenv

# Load .env before anything else so HF_TOKEN is available to benchmark.py
# Load .env from current or parent directory
load_dotenv(Path(__file__).parent / ".env")
load_dotenv(Path(__file__).parent.parent / ".env")
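# Example .env (illustrative values; HF_TOKEN is consumed by benchmark.py per the
# note above, and MAX_FILE_SIZE_MB is read in the Config section below):
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   MAX_FILE_SIZE_MB=5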

import numpy as np
import pandas as pd
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles

try:
    from benchmark import run_benchmark, infer_task, BUILDERS, _prep, _encode_target
except ImportError:
    from webapp.benchmark import run_benchmark, infer_task, BUILDERS, _prep, _encode_target

# ── Config ─────────────────────────────────────────────────────────────────────
MAX_FILE_BYTES = int(os.getenv("MAX_FILE_SIZE_MB", "5")) * 1024 * 1024   # default 5 MB

app = FastAPI(title="SAP RPT-1 Benchmarking API", version="1.0.0")

# ── Static files (frontend) ────────────────────────────────────────────────────
STATIC_DIR = Path(__file__).parent / "static"
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")


@app.get("/")
def root():
    return FileResponse(str(STATIC_DIR / "landing.html"))

@app.get("/arena")
def arena():
    return FileResponse(str(STATIC_DIR / "arena.html"))


# ── /preview ───────────────────────────────────────────────────────────────────
@app.post("/preview")
async def preview(file: UploadFile = File(...)):
    """
    Return column names + first 5 rows of the uploaded CSV.
    Used by the frontend to let the user pick the target column.
    """
    content = await file.read()
    if len(content) > MAX_FILE_BYTES:
        raise HTTPException(413, f"File too large. Max size is {MAX_FILE_BYTES // (1024*1024)} MB.")

    try:
        df = pd.read_csv(io.BytesIO(content))
    except Exception as e:
        raise HTTPException(400, f"Could not parse CSV: {e}")

    if df.shape[1] < 2:
        raise HTTPException(400, "CSV must have at least 2 columns (features + target).")

    # Guess default target: last column
    default_target = df.columns[-1]

    return JSONResponse({
        "columns":        list(df.columns),
        "default_target": default_target,
        "n_rows":         len(df),
        "n_cols":         df.shape[1],
        "preview":        df.head(5).fillna("").to_dict("records"),
    })
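
# Example /preview request (illustrative; assumes the app is served locally on
# uvicorn's default port):
#   curl -F "file=@data.csv" http://localhost:8000/preview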



# ── Live Prediction Wrappers ──────────────────────────────────────────────────

class LiveVotingEnsemble:
    """Soft-voting ensemble rebuilt from the benchmark's component models,
    used for single-row live predictions in the playground."""

    def __init__(self, names, builders, task):
        self.models = [(n, builders[n](task)) for n in names]
        self.task = task

    def fit(self, X, y):
        for _, m in self.models:
            m.fit(X, y)

    def predict(self, X):
        if self.task == "regression":
            preds = [m.predict(X).ravel()[0] for _, m in self.models]
            return np.array([np.mean(preds)])

        # Classification: prefer averaged probabilities, fall back to majority vote.
        try:
            proba = self.predict_proba(X)
            return np.argmax(proba, axis=1)
        except Exception:
            preds = [int(m.predict(X).ravel()[0]) for _, m in self.models]
            return np.array([np.bincount(preds).argmax()])

    def predict_proba(self, X):
        all_probas = []
        for _, m in self.models:
            try:
                all_probas.append(np.asarray(m.predict_proba(X)))
            except Exception:
                # Fallback: one-hot from the hard prediction. n_classes is unknown
                # here, so use a 100-wide array; /predict trims the extra columns
                # back down to the real label count.
                pred = int(m.predict(X).ravel()[0])
                oh = np.zeros((1, 100))
                if pred < 100:
                    oh[0, pred] = 1.0
                all_probas.append(oh)

        # Pad to a common width so the average works even when one model used the
        # 100-wide one-hot fallback above.
        width = max(p.shape[1] for p in all_probas)
        padded = [np.pad(p, ((0, 0), (0, width - p.shape[1]))) for p in all_probas]
        return np.mean(padded, axis=0)

class LiveStackingEnsemble:
    """Stacking ensemble rebuilt from the benchmark's component models with a
    simple linear meta-learner, used for live predictions in the playground."""

    def __init__(self, names, builders, task):
        from sklearn.ensemble import StackingClassifier, StackingRegressor
        from sklearn.linear_model import LogisticRegression, Ridge
        estimators = [(n, builders[n](task)) for n in names]
        if task == "classification":
            self.model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=3)
        else:
            self.model = StackingRegressor(estimators=estimators, final_estimator=Ridge(), cv=3)

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        # Normalise to 2-D so /predict can always call ravel()[0] on the result.
        res = self.model.predict(X)
        return res.reshape(1, -1) if res.ndim == 1 else res

    def predict_proba(self, X):
        return self.model.predict_proba(X)

# ── Live Prediction Cache ──────────────────────────────────────────────────────
# Module-level cache: one champion model per process, shared across requests.
CHAMPION_MODEL = None
CHAMPION_INFO  = {"name": None, "task": None, "features": []}

@app.post("/benchmark")
async def benchmark(
    file:       UploadFile = File(...),
    target_col: str        = Form(...),
):
    """
    Run the full benchmark on the uploaded CSV, return the results, and cache
    the winning model so the Live Playground (/predict) can use it.
    """
    global CHAMPION_MODEL, CHAMPION_INFO
    content = await file.read()
    if len(content) > MAX_FILE_BYTES:
        raise HTTPException(413, f"File too large. Max {MAX_FILE_BYTES // (1024*1024)} MB.")

    try:
        df = pd.read_csv(io.BytesIO(content))
    except Exception as e:
        raise HTTPException(400, f"Could not parse CSV: {e}")

    if target_col not in df.columns:
        raise HTTPException(400, f"Column '{target_col}' not found.")

    try:
        result = run_benchmark(df, target_col)
        
        # Recursively convert numpy scalars so the result is JSON-serializable
        def sanitize(obj):
            if isinstance(obj, dict):
                return {k: sanitize(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [sanitize(v) for v in obj]
            elif hasattr(obj, "item"): # Handle numpy scalars
                return obj.item()
            elif isinstance(obj, np.bool_):
                return bool(obj)
            return obj

        result = sanitize(result)
        
        # Add explicit feature types for the playground UI
        feature_types = {}
        for col in df.columns:
            if col == target_col: continue
            if pd.api.types.is_numeric_dtype(df[col]):
                feature_types[col] = "numeric"
            else:
                feature_types[col] = "categorical"
        result["dataset_info"]["feature_types"] = feature_types
        
        # Cache the Best Overall model for the Live Playground
        best_name = result["recommendation"]["recommendations"]["best_overall"]["model"]
        X = df.drop(columns=[target_col])
        y_raw = df[target_col]
        task = result["dataset_info"]["task"]
        y, le = _encode_target(y_raw, task)
        
        # Capture the final encoders from the full dataset
        X_p, feat_encoders = _prep(X)
        
        if best_name == "Voting Ensemble":
            comp_names = result["ensemble_info"]["Voting Ensemble"]["components"]
            CHAMPION_MODEL = LiveVotingEnsemble(comp_names, BUILDERS, task)
            CHAMPION_MODEL.fit(X_p, y)
        elif best_name == "Stacking Ensemble":
            comp_names = result["ensemble_info"]["Stacking Ensemble"]["components"]
            CHAMPION_MODEL = LiveStackingEnsemble(comp_names, BUILDERS, task)
            CHAMPION_MODEL.fit(X_p, y)
        else:
            builder = BUILDERS.get(best_name)
            if builder:
                CHAMPION_MODEL = builder(task)
                CHAMPION_MODEL.fit(X_p, y)
            
        CHAMPION_INFO = {
            "name": best_name,
            "task": task,
            "features": list(X.columns),
            "labels": list(le.classes_) if le else None,
            "encoders": feat_encoders  # Store these for the /predict endpoint!
        }
            
    except Exception as e:
        raise HTTPException(500, f"Benchmarking failed: {e}")

    return JSONResponse(result)
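
# Example /benchmark request (illustrative; "data.csv" and the "target" column
# name are placeholders):
#   curl -F "file=@data.csv" -F "target_col=target" http://localhost:8000/benchmark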


@app.post("/predict")
async def predict(data: dict):
    """
    Get a live prediction from the cached champion model.
    """
    global CHAMPION_MODEL, CHAMPION_INFO
    if not CHAMPION_MODEL:
        raise HTTPException(400, "No champion model loaded. Run a benchmark first.")
    
    try:
        # Convert input dict to DataFrame
        input_df = pd.DataFrame([data])
        # Ensure column order matches training
        input_df = input_df[CHAMPION_INFO["features"]]
        
        # Use the EXACT same encoders that were used during training
        X_test, _ = _prep(input_df, encoders=CHAMPION_INFO.get("encoders"))
        
        if CHAMPION_INFO["task"] == "classification":
            raw_pred = CHAMPION_MODEL.predict(X_test)
            # Flatten if nested (CatBoost/Sklearn sometimes return [[val]] or [val])
            pred_val = raw_pred.ravel()[0]
            pred_idx = int(pred_val)
            
            labels = CHAMPION_INFO["labels"]
            label = labels[pred_idx] if labels and pred_idx < len(labels) else str(pred_idx)
            
            try:
                proba_raw = CHAMPION_MODEL.predict_proba(X_test)
                proba = proba_raw.ravel().tolist()
                # Ensure we only return as many probabilities as we have labels
                if CHAMPION_INFO["labels"] and len(proba) > len(CHAMPION_INFO["labels"]):
                    proba = proba[:len(CHAMPION_INFO["labels"])]
            except Exception:
                proba = None
            return {
                "prediction": label, 
                "probabilities": proba, 
                "labels": CHAMPION_INFO["labels"]
            }
        else:
            raw_pred = CHAMPION_MODEL.predict(X_test)
            pred = float(raw_pred.ravel()[0])
            return {"prediction": pred}
            
    except Exception as e:
        import traceback
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=400)
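
# Example /predict request (illustrative; the JSON keys must match the feature
# columns cached in CHAMPION_INFO["features"]; "age" and "city" are invented here):
#   curl -X POST http://localhost:8000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"age": 42, "city": "Berlin"}'

# Minimal local entry point (a sketch, assuming uvicorn, the usual ASGI server
# for FastAPI, is installed):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)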