File size: 3,197 Bytes
45e0498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# multi_species_pipeline.py

import os
import json
import pickle
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# -------- CONFIG --------

# Species identifier -> time-series CSV path. Every file follows the
# "migration_timeseries_<species>.csv" naming pattern.
SPECIES_FILES = {
    species: f"migration_timeseries_{species}.csv"
    for species in ("mackerel", "sardinella", "scomber", "skipjack", "tuna")
}

# Training-only hyperparameter (not exposed to the frontend): number of
# consecutive time steps that make up one model input window.
SEQUENCE_LENGTH = 3


def train_for_species(species_id: str, ts_csv: str) -> None:
    """Train a next-position LSTM for one species and save its artifacts.

    Reads the time-series CSV, orders it by ``year``/``month``, min-max scales
    the latitude/longitude pairs, and fits an LSTM that predicts the next
    scaled coordinate pair from the previous ``SEQUENCE_LENGTH`` pairs. The
    model (``.h5``), the fitted scaler (``.pkl``) and a metadata JSON are
    written to ``models/<species_id>/``.

    Input problems (missing file, missing columns, too few usable rows) are
    reported on stdout and the function returns before writing any artifact.

    Args:
        species_id: Identifier used in log messages, the output directory
            name and the artifact file names.
        ts_csv: Path to a CSV with ``year``, ``month``, ``decimalLatitude``
            and ``decimalLongitude`` columns.
    """
    if not os.path.exists(ts_csv):
        print(f"[WARN] Timeseries CSV not found for {species_id}: {ts_csv}")
        return

    print(f"\n=== Training LSTM for {species_id} from {ts_csv} ===")

    df = pd.read_csv(ts_csv)

    # Validate the schema before touching any column: sorting first would
    # raise KeyError on a missing "year"/"month" and bypass this message.
    required = {"year", "month", "decimalLatitude", "decimalLongitude"}
    missing = required - set(df.columns)
    if missing:
        print(f"[ERROR] Missing columns {missing} in {ts_csv}")
        return

    # Incomplete rows would turn the training loss into NaN and make the
    # int() conversions in the metadata step fail, so drop them up front.
    rows_before = len(df)
    df = df.dropna(subset=sorted(required))
    if len(df) < rows_before:
        print(
            f"[WARN] Dropped {rows_before - len(df)} incomplete rows "
            f"for {species_id}"
        )

    df = df.sort_values(["year", "month"]).reset_index(drop=True)

    # At least one full window plus one target row is needed. Checking here
    # (before scaling) also keeps an empty table from crashing the scaler.
    if len(df) <= SEQUENCE_LENGTH:
        print(f"[ERROR] Not enough data to train for {species_id}")
        return

    coords = df[["decimalLatitude", "decimalLongitude"]].values

    scaler = MinMaxScaler()
    coords_scaled = scaler.fit_transform(coords)

    # Sliding windows: X[k] holds SEQUENCE_LENGTH consecutive points and
    # y[k] is the point that immediately follows them.
    X = np.array([
        coords_scaled[end - SEQUENCE_LENGTH:end]
        for end in range(SEQUENCE_LENGTH, len(coords_scaled))
    ])
    y = coords_scaled[SEQUENCE_LENGTH:]

    model = Sequential()
    model.add(LSTM(64, activation="tanh", input_shape=(SEQUENCE_LENGTH, 2)))
    model.add(Dense(32, activation="relu"))
    model.add(Dense(2))  # one output each for scaled latitude and longitude
    model.compile(optimizer="adam", loss="mse")

    model.fit(X, y, epochs=50, batch_size=8, verbose=1)

    out_dir = os.path.join("models", species_id)
    os.makedirs(out_dir, exist_ok=True)

    # Species-specific filenames
    model_path = os.path.join(out_dir, f"{species_id}_model.h5")
    scaler_path = os.path.join(out_dir, f"{species_id}_scaler.pkl")
    meta_path = os.path.join(out_dir, f"{species_id}_metadata.json")

    model.save(model_path)

    with open(scaler_path, "wb") as f:
        pickle.dump(scaler, f)

    # Store everything the backend needs (no frontend involvement)
    metadata = {
        "species": species_id,
        "sequence_length": SEQUENCE_LENGTH,                         # internal
        "last_year": int(df["year"].iloc[-1]),
        "last_month": int(df["month"].iloc[-1]),
        # Most recent scaled window, i.e. the seed for the next prediction.
        "last_sequence": coords_scaled[-SEQUENCE_LENGTH:].tolist()  # internal
    }

    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"[OK] Saved {model_path}, {scaler_path}, {meta_path}")


def main():
    """Ensure the ``models`` directory exists, then train every configured species."""
    os.makedirs("models", exist_ok=True)
    # Each entry is a (species name, CSV path) pair, in configuration order.
    for entry in SPECIES_FILES.items():
        train_for_species(*entry)


# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()