File size: 5,263 Bytes
184147c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from fastapi import FastAPI
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ASGI application object; exposes the single /predict endpoint below.
app = FastAPI(
    title="Displacement Prediction API",
    description="API for predicting displacement using LightGBM quantile regression",
)

def _pinball_loss(y_true, y_pred, alpha):
    """Return the mean pinball (quantile) loss at quantile level *alpha*.

    Standard quantile-regression evaluation: penalizes under-prediction by
    ``alpha`` and over-prediction by ``1 - alpha`` per unit of error.
    """
    error = np.asarray(y_true) - np.asarray(y_pred)
    return float(np.mean(np.maximum(alpha * error, (alpha - 1) * error)))


def _build_long_frame(df, disp_cols):
    """Reshape the wide per-point frame into a long (ps_id, date, disp_mm) frame.

    Adds three autoregressive lag features per point (lag1..lag3) and drops the
    rows made incomplete by the lagging. Raises KeyError if an expected id
    column is missing from *df*.
    """
    long_df = df.melt(
        id_vars=["ps_id", "lat", "lon", "velocity_mm_yr", "risk"],
        value_vars=disp_cols,
        var_name="date",
        value_name="disp_mm",
    )

    # Column names are "disp_mm_YYYYMMDD"; strip the prefix and parse the date.
    long_df["date"] = pd.to_datetime(
        long_df["date"].str.replace("disp_mm_", "", regex=False), format="%Y%m%d"
    )

    # Chronological order within each point is required for correct lagging.
    long_df = long_df.sort_values(["ps_id", "date"])

    # Last three observed displacements per point as features.
    grouped = long_df.groupby("ps_id")["disp_mm"]
    for lag in (1, 2, 3):
        long_df[f"lag{lag}"] = grouped.shift(lag)

    # The first 3 rows of every point have NaN lags; drop them.
    return long_df.dropna()


# NOTE: deliberately a plain `def`, not `async def`. The handler does only
# blocking CPU/disk work (CSV load + training three LightGBM models); FastAPI
# runs sync endpoints in a threadpool, whereas an `async def` body would block
# the event loop for the whole request.
@app.get("/predict", response_class=JSONResponse)
def predict_displacement():
    """
    Endpoint to predict displacement using LightGBM quantile regression.

    Trains 0.1 / 0.5 / 0.9 quantile models on lag features built from
    'synthetic_ps_points.csv' and evaluates on the chronologically last 20%.

    Returns JSON with evaluation metrics and chart data for the first 100 test
    samples. Returns 404 if the dataset file is missing and 422 if the dataset
    does not have the expected shape.
    """
    # Load data
    try:
        df = pd.read_csv("synthetic_ps_points.csv")
    except FileNotFoundError:
        return JSONResponse(
            status_code=404,
            content={"error": "Dataset file 'synthetic_ps_points.csv' not found in the working directory."}
        )

    # Identify displacement columns, ordered by their embedded date.
    disp_cols = sorted(
        (c for c in df.columns if c.startswith("disp_mm_")),
        key=lambda c: pd.to_datetime(c.replace("disp_mm_", ""), format="%Y%m%d"),
    )
    if not disp_cols:
        return JSONResponse(
            status_code=422,
            content={"error": "No 'disp_mm_*' displacement columns found in the dataset."},
        )

    # Wide -> long with lag features; a missing id column is a client data error,
    # not a server fault, so surface it as 422 instead of an unhandled 500.
    try:
        long_df = _build_long_frame(df, disp_cols)
    except KeyError as exc:
        return JSONResponse(
            status_code=422,
            content={"error": f"Dataset is missing an expected column: {exc}"},
        )

    if long_df.empty:
        return JSONResponse(
            status_code=422,
            content={"error": "Not enough observations per point to build lag features."},
        )

    # Features and target
    feature_cols = ["lat", "lon", "velocity_mm_yr", "lag1", "lag2", "lag3"]
    X = long_df[feature_cols]
    y = long_df["disp_mm"]

    # No shuffling: preserve time order so the test split is the "future".
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train one LightGBM model per quantile. NOTE(review): models are retrained
    # on every request; acceptable for a demo, but cache them for production.
    quantiles = [0.1, 0.5, 0.9]
    models = {}
    for q in quantiles:
        model = lgb.LGBMRegressor(
            objective="quantile",
            alpha=q,
            learning_rate=0.05,
            n_estimators=500,
            max_depth=6,
            verbose=-1,
        )
        model.fit(X_train, y_train)
        models[q] = model

    # Predict all three quantiles on the held-out tail.
    preds = {q: models[q].predict(X_test) for q in quantiles}

    # Point-forecast metrics against the median (0.5) quantile.
    mse = mean_squared_error(y_test, preds[0.5])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds[0.5])

    # Per-quantile pinball losses.
    pinball_loss_01 = _pinball_loss(y_test, preds[0.1], 0.1)
    pinball_loss_05 = _pinball_loss(y_test, preds[0.5], 0.5)
    pinball_loss_09 = _pinball_loss(y_test, preds[0.9], 0.9)

    # Empirical coverage and mean width of the nominal 80% interval [q0.1, q0.9].
    coverage = np.mean((y_test >= preds[0.1]) & (y_test <= preds[0.9]))
    interval_width = np.mean(preds[0.9] - preds[0.1])

    # Velocity check on the very last test sample: implied velocity from the
    # predicted step vs. the dataset's velocity_mm_yr for that point.
    # NOTE(review): this compares a single-step implied velocity of one point
    # against that point's annual rate — a rough sanity check, not a metric
    # aggregated over the test set; confirm this is the intended semantics.
    last_disp_test = X_test["lag1"].iloc[-1]
    last_pred_disp = preds[0.5][-1]
    time_diff_days = (long_df["date"].iloc[-1] - long_df["date"].iloc[-2]).days
    time_diff_years = time_diff_days / 365.25
    predicted_velocity = (last_pred_disp - last_disp_test) / time_diff_years
    actual_velocity = long_df["velocity_mm_yr"].iloc[-1]

    # Chart-friendly rows for (at most) the first 100 test samples.
    chart_data = [
        {
            "index": i,
            "actual": float(y_test.values[i]),
            "predicted_median": float(preds[0.5][i]),
            "lower_bound": float(preds[0.1][i]),
            "upper_bound": float(preds[0.9][i]),
        }
        for i in range(min(100, len(y_test)))
    ]

    # Combine metrics and chart data (schema unchanged from previous versions).
    response = {
        "metrics": {
            "mse": float(mse),
            "rmse": float(rmse),
            "mae": float(mae),
            "pinball_loss_0.1": float(pinball_loss_01),
            "pinball_loss_0.5": float(pinball_loss_05),
            "pinball_loss_0.9": float(pinball_loss_09),
            "coverage_80_percent": float(coverage * 100),
            "interval_width": float(interval_width),
            "actual_velocity": float(actual_velocity),
            "predicted_velocity": float(predicted_velocity),
            "velocity_error": float(abs(actual_velocity - predicted_velocity))
        },
        "chart_data": chart_data
    }

    return JSONResponse(content=response)

if __name__ == "__main__":
    # Development entry point: start a local uvicorn server when the module is
    # executed directly (in deployment an external ASGI server imports `app`).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)