Spaces:

nexacore
/

solana-data-training

Sleeping

App Files Files Community

nexacore committed 16 days ago

Commit

00fb193

verified ·

1 Parent(s): 3582907

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -60

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-import io
 import json
 import joblib
 import shutil
@@ -16,18 +15,29 @@ import matplotlib.pyplot as plt
 from datetime import datetime, timezone
 from huggingface_hub import HfApi, hf_hub_download, list_repo_files
 # ── Config ────────────────────────────────────────────────────────────────────
-DATASET_REPO   = "nexacore/solana-dex-data"
-MODEL_FILE     = "nexa_lgbm_v1.joblib"
-MANIFEST_FILE  = "trained_files_manifest.json"  # tracks which files already trained
-HF_TOKEN       = os.environ.get("HF_TOKEN")
-DATA_DIR       = "/tmp/nexa_data"
-MODEL_PATH     = f"/tmp/{MODEL_FILE}"
-MANIFEST_PATH  = f"/tmp/{MANIFEST_FILE}"
 api = HfApi(token=HF_TOKEN)
 # ── Shared state ──────────────────────────────────────────────────────────────
 state = {
     "status":           "idle",
@@ -50,9 +60,8 @@ def log(msg):
     if len(state["log"]) > 300:
         state["log"] = state["log"][-300:]
-# ── Manifest: tracks which CSV files have already been trained on ──────────────
 def load_manifest():
-    """Download manifest from HF, or return empty if first run."""
     try:
         local = hf_hub_download(
             repo_id   = DATASET_REPO,
@@ -67,7 +76,6 @@ def load_manifest():
         return {"trained_files": [], "model_version": 0, "total_rows": 0}
 def save_manifest(manifest):
-    """Upload manifest back to HF."""
     with open(MANIFEST_PATH, "w") as f:
         json.dump(manifest, f, indent=2)
     api.upload_file(
@@ -87,7 +95,7 @@ def download_new_files(manifest):
         f for f in list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
         if f.endswith(".csv") and not f.startswith("refs/")
     ]
     already_trained = set(manifest["trained_files"])
     new_files = [f for f in all_remote if f not in already_trained]
@@ -103,16 +111,16 @@ def download_new_files(manifest):
         local_path = os.path.join(DATA_DIR, filename)
         try:
             hf_hub_download(
-                repo_id   = DATASET_REPO,
-                filename  = remote_path,
-                repo_type = "dataset",
-                token     = HF_TOKEN,
-                local_dir = "/tmp/nexa_raw",
                 force_download = True,
             )
             src = f"/tmp/nexa_raw/{remote_path}"
             if not os.path.exists(src):
-                src = f"/tmp/nexa_raw/{os.path.basename(remote_path)}"
             shutil.copy(src, local_path)
             downloaded.append((remote_path, local_path))
             log(f"  downloaded: {filename}")
@@ -157,21 +165,23 @@ def engineer_features(df):
     df["dt"] = pd.to_datetime(df["block_time_unix"], unit="s", utc=True)
     df = df.set_index("dt").sort_index()
-    df["is_buy"]   = (df["side"] == "BUY").astype(float)
-    df["is_sell"]  = (df["side"] == "SELL").astype(float)
-    df["is_noise"] = (df["side"] == "NOISE").astype(float)
-    df["buy_vol"]  = df["amount_sol"] * df["is_buy"]
-    df["sell_vol"] = df["amount_sol"] * df["is_sell"]
-    df["noise_vol"]= df["amount_sol"] * df["is_noise"]
     price_1s = df["binance_price"].resample("1s").last().ffill()
-    flows_1s  = df[["buy_vol","sell_vol","noise_vol","is_buy","is_sell","is_noise"]]\
-                  .resample("1s").sum()
     feat = flows_1s.join(price_1s.rename("price"), how="outer").ffill().fillna(0)
     eps  = 1e-9
-    # Rolling windows — multiple sizes so model chooses what matters
     for w in ["15s", "30s", "1min", "5min", "15min"]:
         bv = feat["buy_vol"].rolling(w).sum()
         sv = feat["sell_vol"].rolling(w).sum()
@@ -190,8 +200,6 @@ def engineer_features(df):
         feat[f"flow_imbalance_{w}"] = bv / (bv + sv + eps)
         feat[f"noise_ratio_{w}"]    = nc / (tc + eps)
         feat[f"tx_freq_{w}"]        = tc
-        feat[f"large_buy_{w}"]      = ((df["buy_vol"] > 1.0).resample("1s").sum()
-                                        if w == "30s" else feat.get(f"large_buy_{w}", 0))
     # Price features
     for secs, label in [(30,"30s"),(60,"1min"),(300,"5min"),(900,"15min")]:
@@ -202,10 +210,9 @@ def engineer_features(df):
     feat["price_vol_1m"]   = feat["price"].rolling("1min").std()
     # CEX/DEX spread
-    jup_1s = df["jupiter_price"].resample("1s").last().ffill()
-    feat["dex_cex_spread"] = (jup_1s - feat["price"]) / (feat["price"] + eps)
-    # Divergence (core hypothesis)
     fi = feat["flow_imbalance_30s"]
     pc = feat["price_change_30s"]
     feat["divergence_buy"]  = ((fi > 0.7) & (pc < 0)).astype(float)
@@ -224,30 +231,30 @@ def engineer_features(df):
     log(f"Features: {len(feat):,} rows × {len(feat.columns)} cols")
     return feat
-# ── Step 4: Incremental LightGBM update ───────────────────────────────────────
 def train_model(feat, manifest):
     log("Training / updating LightGBM model...")
-    target_col   = "target_30s"  # primary target
-    drop_cols    = [c for c in feat.columns if c.startswith("target_") or c == "price"]
     feature_cols = [c for c in feat.columns if c not in drop_cols]
     X = feat[feature_cols].values
     y = feat[target_col].values
     # Chronological 80/20 — never shuffle
-    split    = int(len(X) * 0.8)
-    X_train  = X[:split];  X_test = X[split:]
-    y_train  = y[:split];  y_test = y[split:]
     log(f"Train: {len(X_train):,} | Test: {len(X_test):,}")
-    # Load existing model for incremental update
     init_model = None
     if os.path.exists(MODEL_PATH):
         try:
-            wrapped    = joblib.load(MODEL_PATH)
-            init_model = wrapped.booster_
             log(f"Incrementally updating model v{manifest['model_version']}")
         except Exception:
             log("Starting fresh model")
@@ -266,7 +273,7 @@ def train_model(feat, manifest):
         "n_jobs":           -1,
     }
-    # Map -1,0,1 → 0,1,2
     y_tr = y_train + 1
     y_te = y_test  + 1
@@ -281,11 +288,11 @@ def train_model(feat, manifest):
     )
     # Evaluate
-    proba    = model.predict(X_test)           # (n, 3)
-    pred     = proba.argmax(axis=1) - 1        # back to -1,0,1
     buy_mask = y_test != 0
     try:
-        from sklearn.metrics import roc_auc_score
         auc = roc_auc_score(
             (y_test[buy_mask] == 1).astype(int),
             proba[buy_mask, 2]
@@ -311,17 +318,9 @@ def train_model(feat, manifest):
     ax.invert_yaxis()
     plt.tight_layout()
     state["fig_importance"] = fig
-    # Save wrapped model
-    class LGBWrapper:
-        def __init__(self, booster, features):
-            self.booster_       = booster
-            self.feature_names_ = features
-        def predict(self, X):
-            return self.booster_.predict(X).argmax(axis=1) - 1
-        def predict_proba(self, X):
-            return self.booster_.predict(X)
     wrapped = LGBWrapper(model, feature_cols)
     joblib.dump(wrapped, MODEL_PATH)
     log("Model saved locally")
@@ -338,7 +337,6 @@ def upload_and_update(manifest, newly_trained_files):
         token           = HF_TOKEN,
     )
-    # Update manifest
     manifest["trained_files"].extend(newly_trained_files)
     manifest["model_version"] += 1
     manifest["total_rows"]    += state["new_rows"]
@@ -369,7 +367,7 @@ def run_pipeline():
             state["status"] = "idle"
             return
-        df   = load_new_data(downloaded)
         if df is None or len(df) < 1000:
             log(f"Not enough new data ({len(df) if df is not None else 0} rows) — skipping")
             state["status"] = "idle"
@@ -389,7 +387,7 @@ def run_pipeline():
     finally:
         state["status"] = "idle"
-# ── Scheduler ─────────────────────────────────────────────────────────────────
 def start_scheduler():
     schedule.every(24).hours.do(run_pipeline)
     while True:
@@ -449,10 +447,9 @@ with gr.Blocks(title="NEXA ML Dashboard") as demo:
                         lambda: state["fig_importance"], outputs=imp_plot
                     )
-    # Refresh status every 10s
     gr.Timer(10).tick(get_status, outputs=status_md)
-# Startup: run pipeline if no model yet
 def startup():
     time.sleep(8)
     manifest = load_manifest()

 import os
 import json
 import joblib
 import shutil
 from datetime import datetime, timezone
 from huggingface_hub import HfApi, hf_hub_download, list_repo_files
+from sklearn.metrics import roc_auc_score
 # ── Config ────────────────────────────────────────────────────────────────────
+DATASET_REPO  = "nexacore/solana-dex-data"
+MODEL_FILE    = "nexa_lgbm_v1.joblib"
+MANIFEST_FILE = "trained_files_manifest.json"
+HF_TOKEN      = os.environ.get("HF_TOKEN")
+DATA_DIR      = "/tmp/nexa_data"
+MODEL_PATH    = f"/tmp/{MODEL_FILE}"
+MANIFEST_PATH = f"/tmp/{MANIFEST_FILE}"
 api = HfApi(token=HF_TOKEN)
+# ── Module-level wrapper — MUST be here for joblib pickling to work ───────────
+class LGBWrapper:
+    # Small container that bundles a trained booster with the list of feature
+    # column names it was trained on, so both are dumped and reloaded by joblib
+    # as a single object. train_model() builds it as
+    # LGBWrapper(model, feature_cols) and later reads .booster_ back to
+    # continue training from the saved model.
+    def __init__(self, booster, features):
+        # booster:  the trained model; only its .predict(X) method is used here.
+        # features: feature column names, in the order used to build X.
+        self.booster_       = booster
+        self.feature_names_ = features
+    def predict(self, X):
+        # Take the highest-scoring column per row and shift the index by -1,
+        # undoing the -1,0,1 -> 0,1,2 label mapping applied before training.
+        # NOTE(review): assumes booster_.predict(X) returns a 2-D
+        # (n_samples, n_classes) array — confirm for the configured objective.
+        return self.booster_.predict(X).argmax(axis=1) - 1
+    def predict_proba(self, X):
+        # Return the booster's raw predict() output unchanged (presumably the
+        # per-class probabilities, columns ordered as labels -1, 0, 1 — verify).
+        return self.booster_.predict(X)
 # ── Shared state ──────────────────────────────────────────────────────────────
 state = {
     "status":           "idle",
     if len(state["log"]) > 300:
         state["log"] = state["log"][-300:]
+# ── Manifest ──────────────────────────────────────────────────────────────────
 def load_manifest():
     try:
         local = hf_hub_download(
             repo_id   = DATASET_REPO,
         return {"trained_files": [], "model_version": 0, "total_rows": 0}
 def save_manifest(manifest):
     with open(MANIFEST_PATH, "w") as f:
         json.dump(manifest, f, indent=2)
     api.upload_file(
         f for f in list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
         if f.endswith(".csv") and not f.startswith("refs/")
     ]
     already_trained = set(manifest["trained_files"])
     new_files = [f for f in all_remote if f not in already_trained]
         local_path = os.path.join(DATA_DIR, filename)
         try:
             hf_hub_download(
+                repo_id        = DATASET_REPO,
+                filename       = remote_path,
+                repo_type      = "dataset",
+                token          = HF_TOKEN,
+                local_dir      = "/tmp/nexa_raw",
                 force_download = True,
             )
             src = f"/tmp/nexa_raw/{remote_path}"
             if not os.path.exists(src):
+                src = f"/tmp/nexa_raw/{filename}"
             shutil.copy(src, local_path)
             downloaded.append((remote_path, local_path))
             log(f"  downloaded: {filename}")
     df["dt"] = pd.to_datetime(df["block_time_unix"], unit="s", utc=True)
     df = df.set_index("dt").sort_index()
+    df["is_buy"]    = (df["side"] == "BUY").astype(float)
+    df["is_sell"]   = (df["side"] == "SELL").astype(float)
+    df["is_noise"]  = (df["side"] == "NOISE").astype(float)
+    df["buy_vol"]   = df["amount_sol"] * df["is_buy"]
+    df["sell_vol"]  = df["amount_sol"] * df["is_sell"]
+    df["noise_vol"] = df["amount_sol"] * df["is_noise"]
     price_1s = df["binance_price"].resample("1s").last().ffill()
+    jup_1s   = df["jupiter_price"].resample("1s").last().ffill()
+    flows_1s = df[["buy_vol","sell_vol","noise_vol","is_buy","is_sell","is_noise"]]\
+                 .resample("1s").sum()
     feat = flows_1s.join(price_1s.rename("price"), how="outer").ffill().fillna(0)
+    feat = feat.join(jup_1s.rename("jup_price"), how="left").ffill().fillna(0)
     eps  = 1e-9
+    # Rolling windows
     for w in ["15s", "30s", "1min", "5min", "15min"]:
         bv = feat["buy_vol"].rolling(w).sum()
         sv = feat["sell_vol"].rolling(w).sum()
         feat[f"flow_imbalance_{w}"] = bv / (bv + sv + eps)
         feat[f"noise_ratio_{w}"]    = nc / (tc + eps)
         feat[f"tx_freq_{w}"]        = tc
     # Price features
     for secs, label in [(30,"30s"),(60,"1min"),(300,"5min"),(900,"15min")]:
     feat["price_vol_1m"]   = feat["price"].rolling("1min").std()
     # CEX/DEX spread
+    feat["dex_cex_spread"] = (feat["jup_price"] - feat["price"]) / (feat["price"] + eps)
+    # Divergence features (core hypothesis)
     fi = feat["flow_imbalance_30s"]
     pc = feat["price_change_30s"]
     feat["divergence_buy"]  = ((fi > 0.7) & (pc < 0)).astype(float)
     log(f"Features: {len(feat):,} rows × {len(feat.columns)} cols")
     return feat
+# ── Step 4: Train / Incrementally Update LightGBM ────────────────────────────
 def train_model(feat, manifest):
     log("Training / updating LightGBM model...")
+    target_col   = "target_30s"
+    drop_cols    = [c for c in feat.columns if c.startswith("target_") or c in ("price","jup_price")]
     feature_cols = [c for c in feat.columns if c not in drop_cols]
     X = feat[feature_cols].values
     y = feat[target_col].values
     # Chronological 80/20 — never shuffle
+    split   = int(len(X) * 0.8)
+    X_train = X[:split];  X_test = X[split:]
+    y_train = y[:split];  y_test = y[split:]
     log(f"Train: {len(X_train):,} | Test: {len(X_test):,}")
+    # Load existing booster for incremental update
     init_model = None
     if os.path.exists(MODEL_PATH):
         try:
+            existing   = joblib.load(MODEL_PATH)
+            init_model = existing.booster_
             log(f"Incrementally updating model v{manifest['model_version']}")
         except Exception:
             log("Starting fresh model")
         "n_jobs":           -1,
     }
+    # Map -1,0,1 → 0,1,2 for multiclass
     y_tr = y_train + 1
     y_te = y_test  + 1
     )
     # Evaluate
+    proba    = model.predict(X_test)
+    pred     = proba.argmax(axis=1) - 1
     buy_mask = y_test != 0
     try:
         auc = roc_auc_score(
             (y_test[buy_mask] == 1).astype(int),
             proba[buy_mask, 2]
     ax.invert_yaxis()
     plt.tight_layout()
     state["fig_importance"] = fig
+    plt.close(fig)
+    # Save using module-level LGBWrapper — joblib can pickle it correctly
     wrapped = LGBWrapper(model, feature_cols)
     joblib.dump(wrapped, MODEL_PATH)
     log("Model saved locally")
         token           = HF_TOKEN,
     )
     manifest["trained_files"].extend(newly_trained_files)
     manifest["model_version"] += 1
     manifest["total_rows"]    += state["new_rows"]
             state["status"] = "idle"
             return
+        df = load_new_data(downloaded)
         if df is None or len(df) < 1000:
             log(f"Not enough new data ({len(df) if df is not None else 0} rows) — skipping")
             state["status"] = "idle"
     finally:
         state["status"] = "idle"
+# ── Scheduler: every 24 hours ─────────────────────────────────────────────────
 def start_scheduler():
     schedule.every(24).hours.do(run_pipeline)
     while True:
                         lambda: state["fig_importance"], outputs=imp_plot
                     )
     gr.Timer(10).tick(get_status, outputs=status_md)
+# ── Startup ───────────────────────────────────────────────────────────────────
 def startup():
     time.sleep(8)
     manifest = load_manifest()