Spaces:
Sleeping
Sleeping
Aurora Brain v1.0 — Regime Detector + Feature Engine + API
Browse files- Dockerfile +23 -0
- README.md +75 -11
- app.py +214 -0
- download_data.py +436 -0
- feature_engine.py +511 -0
- regime_detector.py +286 -0
- regime_labeler.py +217 -0
- requirements.txt +33 -0
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Dependencias del sistema
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
gcc g++ && rm -rf /var/lib/apt/lists/*
|
| 8 |
+
|
| 9 |
+
# Dependencias Python
|
| 10 |
+
COPY requirements.txt .
|
| 11 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 12 |
+
|
| 13 |
+
# Código
|
| 14 |
+
COPY . .
|
| 15 |
+
|
| 16 |
+
# Crear directorios de datos y modelos
|
| 17 |
+
RUN mkdir -p data models
|
| 18 |
+
|
| 19 |
+
# Puerto de HuggingFace Spaces
|
| 20 |
+
EXPOSE 7860
|
| 21 |
+
|
| 22 |
+
# Arrancar API
|
| 23 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,11 +1,75 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🧠 Aurora Brain — ML Trading Intelligence
|
| 2 |
+
|
| 3 |
+
Sistema de Machine Learning para detección de régimen de mercado y predicción de señales de trading.
|
| 4 |
+
Parte del proyecto **Aurora Trader Bot** (Cerebro Guardián).
|
| 5 |
+
|
| 6 |
+
## Arquitectura
|
| 7 |
+
|
| 8 |
+
```
|
| 9 |
+
CAPA 1: Detector de Régimen (XGBoost)
|
| 10 |
+
→ Clasifica: TRENDING / RANGING / VOLATILE / BREAKOUT
|
| 11 |
+
|
| 12 |
+
CAPA 2: Modelos Especialistas (TFT + XGBoost)
|
| 13 |
+
→ Un modelo por régimen, entrenado solo con datos de ese régimen
|
| 14 |
+
|
| 15 |
+
CAPA 3: Feature Engineering (200+ features)
|
| 16 |
+
→ Microestructura + Momentum + Volumen + Cross-asset + On-chain + Sentimiento
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## Uso rápido
|
| 20 |
+
|
| 21 |
+
```bash
|
| 22 |
+
# 1. Descargar datos históricos (BTC/ETH/SOL, 5 años, 4H)
|
| 23 |
+
python download_data.py --symbols BTCUSDT ETHUSDT SOLUSDT --days 1825
|
| 24 |
+
|
| 25 |
+
# 2. Generar features (200+ por vela)
|
| 26 |
+
python feature_engine.py --all
|
| 27 |
+
|
| 28 |
+
# 3. Etiquetar regímenes
|
| 29 |
+
python regime_labeler.py --all
|
| 30 |
+
|
| 31 |
+
# 4. Entrenar detector de régimen
|
| 32 |
+
python regime_detector.py --symbol BTCUSDT
|
| 33 |
+
|
| 34 |
+
# 5. Levantar API
|
| 35 |
+
python app.py
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## API Endpoints
|
| 39 |
+
|
| 40 |
+
| Endpoint | Método | Descripción |
|
| 41 |
+
|----------|--------|-------------|
|
| 42 |
+
| `/health` | GET | Estado del servicio |
|
| 43 |
+
| `/regime` | POST | Régimen actual del mercado |
|
| 44 |
+
| `/predict` | POST | Predicción completa (régimen + señal) |
|
| 45 |
+
| `/models` | GET | Info de modelos cargados |
|
| 46 |
+
|
| 47 |
+
## Pipeline completo
|
| 48 |
+
|
| 49 |
+
```
|
| 50 |
+
Binance API → download_data.py → data/*.parquet
|
| 51 |
+
↓
|
| 52 |
+
feature_engine.py → data/features_*.parquet
|
| 53 |
+
↓
|
| 54 |
+
regime_labeler.py → data/labeled_*.parquet
|
| 55 |
+
↓
|
| 56 |
+
regime_detector.py → models/regime_model.pkl
|
| 57 |
+
↓
|
| 58 |
+
app.py → API REST (/regime, /predict)
|
| 59 |
+
↓
|
| 60 |
+
Raspberry Pi 5 (aurora_brain.py) → Strategy Runner
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## Stack
|
| 64 |
+
|
| 65 |
+
- **Feature Engine:** pandas + pandas_ta + numpy
|
| 66 |
+
- **Detector Régimen:** XGBoost (scikit-learn)
|
| 67 |
+
- **Modelos Especialistas:** TFT (Darts/PyTorch) — Fase 3+
|
| 68 |
+
- **API:** FastAPI + uvicorn
|
| 69 |
+
- **Datos:** Binance API + yfinance + CoinGecko
|
| 70 |
+
- **Infraestructura:** HuggingFace Space (GPU para entrenamiento, CPU para inferencia)
|
| 71 |
+
|
| 72 |
+
## Autor
|
| 73 |
+
|
| 74 |
+
Eduardo (Mendoza, Argentina) + Claude Opus (Anthropic)
|
| 75 |
+
Proyecto Aurora Trader Bot — v5.3
|
app.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ AURORA BRAIN — API Server (HuggingFace Space) ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ FastAPI server que expone las predicciones del Brain. ║
|
| 6 |
+
║ Endpoints: ║
|
| 7 |
+
║ GET /health — Estado del servicio ║
|
| 8 |
+
║ POST /regime — Régimen actual del mercado ║
|
| 9 |
+
║ POST /predict — Predicción completa ║
|
| 10 |
+
║ GET /models — Info de modelos cargados ║
|
| 11 |
+
║ ║
|
| 12 |
+
║ La Pi llama a estos endpoints cada 15 minutos. ║
|
| 13 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 14 |
+
"""
|
| 15 |
+
import os
|
| 16 |
+
import json
|
| 17 |
+
import logging
|
| 18 |
+
import pickle
|
| 19 |
+
import time
|
| 20 |
+
from datetime import datetime, timezone
|
| 21 |
+
from typing import Optional
|
| 22 |
+
|
| 23 |
+
import numpy as np
|
| 24 |
+
import pandas as pd
|
| 25 |
+
from fastapi import FastAPI, HTTPException
|
| 26 |
+
from pydantic import BaseModel
|
| 27 |
+
|
| 28 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 29 |
+
logger = logging.getLogger("AuroraBrain.API")
|
| 30 |
+
|
| 31 |
+
app = FastAPI(
|
| 32 |
+
title="Aurora Brain API",
|
| 33 |
+
description="ML Trading Intelligence — Régimen + Predicciones",
|
| 34 |
+
version="1.0.0",
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
MODELS_DIR = os.path.join(os.path.dirname(__file__), "models")
|
| 38 |
+
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 39 |
+
|
| 40 |
+
# Modelos cargados en memoria
|
| 41 |
+
_regime_model = None
|
| 42 |
+
_regime_metadata = None
|
| 43 |
+
_startup_time = None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ─────────────────────────────────────────────
|
| 47 |
+
# STARTUP
|
| 48 |
+
# ─────────────────────────────────────────────
|
| 49 |
+
@app.on_event("startup")
|
| 50 |
+
async def load_models():
|
| 51 |
+
global _regime_model, _regime_metadata, _startup_time
|
| 52 |
+
_startup_time = datetime.now(timezone.utc)
|
| 53 |
+
|
| 54 |
+
# Cargar modelo de régimen
|
| 55 |
+
model_path = os.path.join(MODELS_DIR, "regime_model.pkl")
|
| 56 |
+
meta_path = os.path.join(MODELS_DIR, "regime_metadata.json")
|
| 57 |
+
|
| 58 |
+
if os.path.exists(model_path):
|
| 59 |
+
with open(model_path, "rb") as f:
|
| 60 |
+
_regime_model = pickle.load(f)
|
| 61 |
+
logger.info("✅ Modelo de régimen cargado")
|
| 62 |
+
|
| 63 |
+
if os.path.exists(meta_path):
|
| 64 |
+
with open(meta_path, "r") as f:
|
| 65 |
+
_regime_metadata = json.load(f)
|
| 66 |
+
logger.info("✅ Metadata de régimen cargada (accuracy: %.1f%%)",
|
| 67 |
+
_regime_metadata.get("accuracy", 0))
|
| 68 |
+
|
| 69 |
+
logger.info("🧠 Aurora Brain API lista")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ─────────────────────────────────────────────
|
| 73 |
+
# SCHEMAS
|
| 74 |
+
# ─────────────────────────────────────────────
|
| 75 |
+
class RegimeRequest(BaseModel):
|
| 76 |
+
symbol: str = "BTCUSDT"
|
| 77 |
+
timeframe: str = "4h"
|
| 78 |
+
|
| 79 |
+
class PredictRequest(BaseModel):
|
| 80 |
+
symbol: str = "BTCUSDT"
|
| 81 |
+
timeframe: str = "4h"
|
| 82 |
+
|
| 83 |
+
class RegimeResponse(BaseModel):
|
| 84 |
+
regime: str
|
| 85 |
+
regime_id: int
|
| 86 |
+
confidence: float
|
| 87 |
+
probabilities: dict
|
| 88 |
+
model_accuracy: float
|
| 89 |
+
timestamp: str
|
| 90 |
+
|
| 91 |
+
class HealthResponse(BaseModel):
|
| 92 |
+
status: str
|
| 93 |
+
uptime_seconds: float
|
| 94 |
+
models_loaded: dict
|
| 95 |
+
version: str
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ─────────────────────────────────────────────
|
| 99 |
+
# ENDPOINTS
|
| 100 |
+
# ─────────────────────────────────────────────
|
| 101 |
+
@app.get("/health", response_model=HealthResponse)
|
| 102 |
+
async def health():
|
| 103 |
+
uptime = (datetime.now(timezone.utc) - _startup_time).total_seconds() if _startup_time else 0
|
| 104 |
+
return HealthResponse(
|
| 105 |
+
status="ok",
|
| 106 |
+
uptime_seconds=round(uptime, 0),
|
| 107 |
+
models_loaded={
|
| 108 |
+
"regime_detector": _regime_model is not None,
|
| 109 |
+
"regime_accuracy": _regime_metadata.get("accuracy", 0) if _regime_metadata else 0,
|
| 110 |
+
},
|
| 111 |
+
version="1.0.0",
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@app.post("/regime", response_model=RegimeResponse)
|
| 116 |
+
async def get_regime(req: RegimeRequest):
|
| 117 |
+
"""Retorna el régimen actual del mercado."""
|
| 118 |
+
if _regime_model is None or _regime_metadata is None:
|
| 119 |
+
raise HTTPException(status_code=503, detail="Modelo de régimen no cargado")
|
| 120 |
+
|
| 121 |
+
# Cargar features más recientes
|
| 122 |
+
features_path = os.path.join(DATA_DIR, f"features_{req.symbol}_{req.timeframe}.parquet")
|
| 123 |
+
if not os.path.exists(features_path):
|
| 124 |
+
raise HTTPException(status_code=404,
|
| 125 |
+
detail=f"Features no encontradas para {req.symbol} {req.timeframe}")
|
| 126 |
+
|
| 127 |
+
df = pd.read_parquet(features_path)
|
| 128 |
+
feature_cols = _regime_metadata["feature_cols"]
|
| 129 |
+
|
| 130 |
+
# Última fila
|
| 131 |
+
last_row = df[feature_cols].iloc[-1:].fillna(df[feature_cols].median())
|
| 132 |
+
|
| 133 |
+
# Predecir
|
| 134 |
+
proba = _regime_model.predict_proba(last_row)[0]
|
| 135 |
+
regime_id = int(np.argmax(proba))
|
| 136 |
+
|
| 137 |
+
REGIME_NAMES = {0: "TRENDING", 1: "RANGING", 2: "VOLATILE", 3: "BREAKOUT"}
|
| 138 |
+
|
| 139 |
+
return RegimeResponse(
|
| 140 |
+
regime=REGIME_NAMES[regime_id],
|
| 141 |
+
regime_id=regime_id,
|
| 142 |
+
confidence=round(float(proba[regime_id]), 4),
|
| 143 |
+
probabilities={REGIME_NAMES[i]: round(float(p), 4) for i, p in enumerate(proba)},
|
| 144 |
+
model_accuracy=_regime_metadata.get("accuracy", 0),
|
| 145 |
+
timestamp=datetime.now(timezone.utc).isoformat(),
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
@app.post("/predict")
|
| 150 |
+
async def predict(req: PredictRequest):
|
| 151 |
+
"""
|
| 152 |
+
Predicción completa: régimen + señal del modelo especialista.
|
| 153 |
+
Por ahora solo retorna régimen. Los modelos especialistas se agregan en Fases 3-6.
|
| 154 |
+
"""
|
| 155 |
+
# Obtener régimen
|
| 156 |
+
regime_req = RegimeRequest(symbol=req.symbol, timeframe=req.timeframe)
|
| 157 |
+
try:
|
| 158 |
+
regime = await get_regime(regime_req)
|
| 159 |
+
except HTTPException:
|
| 160 |
+
regime = None
|
| 161 |
+
|
| 162 |
+
result = {
|
| 163 |
+
"symbol": req.symbol,
|
| 164 |
+
"timeframe": req.timeframe,
|
| 165 |
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 166 |
+
"regime": regime.dict() if regime else None,
|
| 167 |
+
"signal": None, # Fase 3+: modelo especialista por régimen
|
| 168 |
+
"recommendation": "HOLD", # Default hasta que los modelos estén listos
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
# Lógica de recomendación basada en régimen
|
| 172 |
+
if regime:
|
| 173 |
+
if regime.regime == "VOLATILE" and regime.confidence > 0.7:
|
| 174 |
+
result["recommendation"] = "SHIELD_MAX"
|
| 175 |
+
result["action"] = "Activar Guardian Shield modo máximo — NO comprar"
|
| 176 |
+
elif regime.regime == "TRENDING" and regime.confidence > 0.7:
|
| 177 |
+
result["recommendation"] = "SMC_ACTIVE"
|
| 178 |
+
result["action"] = "Habilitar estrategias SMC — mercado en tendencia"
|
| 179 |
+
elif regime.regime == "RANGING" and regime.confidence > 0.7:
|
| 180 |
+
result["recommendation"] = "GRID_SUGGEST"
|
| 181 |
+
result["action"] = "Considerar grid trading — mercado lateral"
|
| 182 |
+
elif regime.regime == "BREAKOUT" and regime.confidence > 0.7:
|
| 183 |
+
result["recommendation"] = "ALERT"
|
| 184 |
+
result["action"] = "Breakout detectado — evaluar entrada rápida"
|
| 185 |
+
|
| 186 |
+
return result
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
@app.get("/models")
|
| 190 |
+
async def models_info():
|
| 191 |
+
"""Info de los modelos cargados."""
|
| 192 |
+
info = {
|
| 193 |
+
"regime_detector": None,
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
if _regime_metadata:
|
| 197 |
+
info["regime_detector"] = {
|
| 198 |
+
"trained_at": _regime_metadata.get("trained_at"),
|
| 199 |
+
"accuracy": _regime_metadata.get("accuracy"),
|
| 200 |
+
"cv_accuracy": _regime_metadata.get("cv_accuracy"),
|
| 201 |
+
"n_features": _regime_metadata.get("n_features"),
|
| 202 |
+
"top_features": _regime_metadata.get("top_features", [])[:10],
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
return info
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# ─────────────────────────────────────────────
|
| 209 |
+
# MAIN (para desarrollo local)
|
| 210 |
+
# ─────────────────────────────────────────────
|
| 211 |
+
if __name__ == "__main__":
|
| 212 |
+
import uvicorn
|
| 213 |
+
port = int(os.environ.get("PORT", 7860))
|
| 214 |
+
uvicorn.run(app, host="0.0.0.0", port=port)
|
download_data.py
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ AURORA BRAIN — Download Data ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Descarga klines históricas de Binance para ML. ║
|
| 6 |
+
║ Soporta múltiples pares y timeframes. ║
|
| 7 |
+
║ Guarda en formato Parquet para eficiencia. ║
|
| 8 |
+
║ ║
|
| 9 |
+
║ Uso: ║
|
| 10 |
+
║ python download_data.py ║
|
| 11 |
+
║ python download_data.py --symbols BTCUSDT ETHUSDT ║
|
| 12 |
+
║ python download_data.py --days 1800 --timeframe 1h ║
|
| 13 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 14 |
+
"""
|
| 15 |
+
import os
|
| 16 |
+
import time
|
| 17 |
+
import argparse
|
| 18 |
+
import logging
|
| 19 |
+
from datetime import datetime, timedelta, timezone
|
| 20 |
+
|
| 21 |
+
import pandas as pd
|
| 22 |
+
import requests
|
| 23 |
+
|
| 24 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 25 |
+
logger = logging.getLogger("AuroraBrain.Data")
|
| 26 |
+
|
| 27 |
+
# ─────────────────────────────────────────────
|
| 28 |
+
# CONFIGURACIÓN
|
| 29 |
+
# ─────────────────────────────────────────────
|
| 30 |
+
BINANCE_BASE = "https://api.binance.com"
|
| 31 |
+
KLINES_ENDPOINT = "/api/v3/klines"
|
| 32 |
+
|
| 33 |
+
DEFAULT_SYMBOLS = ["BTCUSDT", "ETHUSDT", "SOLUSDT"]
|
| 34 |
+
DEFAULT_TIMEFRAME = "4h"
|
| 35 |
+
DEFAULT_DAYS = 1825 # ~5 años
|
| 36 |
+
MAX_CANDLES_PER_REQUEST = 1000
|
| 37 |
+
|
| 38 |
+
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 39 |
+
|
| 40 |
+
# Mapeo de timeframe a milisegundos por vela
|
| 41 |
+
TF_MS = {
|
| 42 |
+
"1m": 60_000, "3m": 180_000, "5m": 300_000, "15m": 900_000,
|
| 43 |
+
"30m": 1_800_000, "1h": 3_600_000, "2h": 7_200_000,
|
| 44 |
+
"4h": 14_400_000, "6h": 21_600_000, "8h": 28_800_000,
|
| 45 |
+
"12h": 43_200_000, "1d": 86_400_000, "1w": 604_800_000,
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ─────────────────────────────────────────────
|
| 50 |
+
# DESCARGA DE KLINES
|
| 51 |
+
# ─────────────────────────────────────────────
|
| 52 |
+
def download_klines(symbol: str, timeframe: str, days: int) -> pd.DataFrame:
|
| 53 |
+
"""
|
| 54 |
+
Descarga klines históricas de Binance con paginación automática.
|
| 55 |
+
Retorna DataFrame con columnas OHLCV + extras.
|
| 56 |
+
"""
|
| 57 |
+
tf_ms = TF_MS.get(timeframe, 14_400_000)
|
| 58 |
+
end_ts = int(datetime.now(timezone.utc).timestamp() * 1000)
|
| 59 |
+
start_ts = end_ts - (days * 86_400_000)
|
| 60 |
+
|
| 61 |
+
all_candles = []
|
| 62 |
+
current_start = start_ts
|
| 63 |
+
page = 0
|
| 64 |
+
|
| 65 |
+
logger.info("📥 Descargando %s %s — %d días (%s → %s)",
|
| 66 |
+
symbol, timeframe, days,
|
| 67 |
+
datetime.fromtimestamp(start_ts / 1000, tz=timezone.utc).strftime("%Y-%m-%d"),
|
| 68 |
+
datetime.fromtimestamp(end_ts / 1000, tz=timezone.utc).strftime("%Y-%m-%d"))
|
| 69 |
+
|
| 70 |
+
while current_start < end_ts:
|
| 71 |
+
params = {
|
| 72 |
+
"symbol": symbol,
|
| 73 |
+
"interval": timeframe,
|
| 74 |
+
"startTime": current_start,
|
| 75 |
+
"endTime": end_ts,
|
| 76 |
+
"limit": MAX_CANDLES_PER_REQUEST,
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
try:
|
| 80 |
+
resp = requests.get(f"{BINANCE_BASE}{KLINES_ENDPOINT}", params=params, timeout=30)
|
| 81 |
+
resp.raise_for_status()
|
| 82 |
+
data = resp.json()
|
| 83 |
+
except Exception as e:
|
| 84 |
+
logger.error("❌ Error descargando %s página %d: %s", symbol, page, e)
|
| 85 |
+
time.sleep(5)
|
| 86 |
+
continue
|
| 87 |
+
|
| 88 |
+
if not data:
|
| 89 |
+
break
|
| 90 |
+
|
| 91 |
+
all_candles.extend(data)
|
| 92 |
+
page += 1
|
| 93 |
+
|
| 94 |
+
# Avanzar al siguiente batch
|
| 95 |
+
last_ts = data[-1][0]
|
| 96 |
+
current_start = last_ts + tf_ms
|
| 97 |
+
|
| 98 |
+
if page % 10 == 0:
|
| 99 |
+
logger.info(" 📊 %s: %d velas descargadas...", symbol, len(all_candles))
|
| 100 |
+
|
| 101 |
+
# Rate limit: max 1200 requests/min en Binance
|
| 102 |
+
time.sleep(0.1)
|
| 103 |
+
|
| 104 |
+
if not all_candles:
|
| 105 |
+
logger.warning("⚠️ No se obtuvieron datos para %s", symbol)
|
| 106 |
+
return pd.DataFrame()
|
| 107 |
+
|
| 108 |
+
# Parsear a DataFrame
|
| 109 |
+
df = pd.DataFrame(all_candles, columns=[
|
| 110 |
+
"open_time", "open", "high", "low", "close", "volume",
|
| 111 |
+
"close_time", "quote_volume", "trades", "taker_buy_base",
|
| 112 |
+
"taker_buy_quote", "ignore",
|
| 113 |
+
])
|
| 114 |
+
|
| 115 |
+
# Tipos
|
| 116 |
+
for col in ["open", "high", "low", "close", "volume", "quote_volume",
|
| 117 |
+
"taker_buy_base", "taker_buy_quote"]:
|
| 118 |
+
df[col] = pd.to_numeric(df[col], errors="coerce")
|
| 119 |
+
|
| 120 |
+
df["trades"] = df["trades"].astype(int)
|
| 121 |
+
df["open_time"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)
|
| 122 |
+
df["close_time"] = pd.to_datetime(df["close_time"], unit="ms", utc=True)
|
| 123 |
+
|
| 124 |
+
# Índice temporal
|
| 125 |
+
df.set_index("open_time", inplace=True)
|
| 126 |
+
df.drop(columns=["ignore"], inplace=True)
|
| 127 |
+
|
| 128 |
+
# Remover duplicados
|
| 129 |
+
df = df[~df.index.duplicated(keep="first")]
|
| 130 |
+
df.sort_index(inplace=True)
|
| 131 |
+
|
| 132 |
+
# Agregar columnas derivadas útiles
|
| 133 |
+
df["taker_buy_ratio"] = df["taker_buy_quote"] / df["quote_volume"].replace(0, 1)
|
| 134 |
+
df["symbol"] = symbol
|
| 135 |
+
|
| 136 |
+
logger.info("✅ %s: %d velas descargadas (%s → %s)",
|
| 137 |
+
symbol, len(df),
|
| 138 |
+
df.index[0].strftime("%Y-%m-%d"),
|
| 139 |
+
df.index[-1].strftime("%Y-%m-%d"))
|
| 140 |
+
|
| 141 |
+
return df
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def download_funding_rate(symbol: str, days: int) -> pd.DataFrame:
|
| 145 |
+
"""Descarga funding rate histórico de Binance Futures."""
|
| 146 |
+
end_ts = int(datetime.now(timezone.utc).timestamp() * 1000)
|
| 147 |
+
start_ts = end_ts - (days * 86_400_000)
|
| 148 |
+
|
| 149 |
+
all_data = []
|
| 150 |
+
current_start = start_ts
|
| 151 |
+
|
| 152 |
+
logger.info("📥 Descargando funding rate %s...", symbol)
|
| 153 |
+
|
| 154 |
+
while current_start < end_ts:
|
| 155 |
+
params = {
|
| 156 |
+
"symbol": symbol,
|
| 157 |
+
"startTime": current_start,
|
| 158 |
+
"endTime": end_ts,
|
| 159 |
+
"limit": 1000,
|
| 160 |
+
}
|
| 161 |
+
try:
|
| 162 |
+
resp = requests.get(
|
| 163 |
+
"https://fapi.binance.com/fapi/v1/fundingRate",
|
| 164 |
+
params=params, timeout=30,
|
| 165 |
+
)
|
| 166 |
+
resp.raise_for_status()
|
| 167 |
+
data = resp.json()
|
| 168 |
+
except Exception as e:
|
| 169 |
+
logger.error("❌ Funding rate error: %s", e)
|
| 170 |
+
time.sleep(2)
|
| 171 |
+
continue
|
| 172 |
+
|
| 173 |
+
if not data:
|
| 174 |
+
break
|
| 175 |
+
|
| 176 |
+
all_data.extend(data)
|
| 177 |
+
current_start = data[-1]["fundingTime"] + 1
|
| 178 |
+
time.sleep(0.2)
|
| 179 |
+
|
| 180 |
+
if not all_data:
|
| 181 |
+
return pd.DataFrame()
|
| 182 |
+
|
| 183 |
+
df = pd.DataFrame(all_data)
|
| 184 |
+
df["fundingRate"] = pd.to_numeric(df["fundingRate"], errors="coerce")
|
| 185 |
+
df["fundingTime"] = pd.to_datetime(df["fundingTime"], unit="ms", utc=True)
|
| 186 |
+
df.set_index("fundingTime", inplace=True)
|
| 187 |
+
df = df[~df.index.duplicated(keep="first")]
|
| 188 |
+
df.sort_index(inplace=True)
|
| 189 |
+
|
| 190 |
+
logger.info("✅ Funding rate %s: %d registros", symbol, len(df))
|
| 191 |
+
return df[["fundingRate"]]
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def download_long_short_ratio(symbol: str, days: int) -> pd.DataFrame:
|
| 195 |
+
"""Descarga Long/Short ratio de Binance Futures."""
|
| 196 |
+
end_ts = int(datetime.now(timezone.utc).timestamp() * 1000)
|
| 197 |
+
start_ts = end_ts - (days * 86_400_000)
|
| 198 |
+
|
| 199 |
+
all_data = []
|
| 200 |
+
current_start = start_ts
|
| 201 |
+
|
| 202 |
+
logger.info("📥 Descargando L/S ratio %s...", symbol)
|
| 203 |
+
|
| 204 |
+
while current_start < end_ts:
|
| 205 |
+
params = {
|
| 206 |
+
"symbol": symbol,
|
| 207 |
+
"period": "4h",
|
| 208 |
+
"startTime": current_start,
|
| 209 |
+
"endTime": end_ts,
|
| 210 |
+
"limit": 500,
|
| 211 |
+
}
|
| 212 |
+
try:
|
| 213 |
+
resp = requests.get(
|
| 214 |
+
"https://fapi.binance.com/futures/data/globalLongShortAccountRatio",
|
| 215 |
+
params=params, timeout=30,
|
| 216 |
+
)
|
| 217 |
+
resp.raise_for_status()
|
| 218 |
+
data = resp.json()
|
| 219 |
+
except Exception as e:
|
| 220 |
+
logger.error("❌ L/S ratio error: %s", e)
|
| 221 |
+
time.sleep(2)
|
| 222 |
+
continue
|
| 223 |
+
|
| 224 |
+
if not data:
|
| 225 |
+
break
|
| 226 |
+
|
| 227 |
+
all_data.extend(data)
|
| 228 |
+
current_start = data[-1]["timestamp"] + 1
|
| 229 |
+
time.sleep(0.3)
|
| 230 |
+
|
| 231 |
+
if not all_data:
|
| 232 |
+
return pd.DataFrame()
|
| 233 |
+
|
| 234 |
+
df = pd.DataFrame(all_data)
|
| 235 |
+
df["longShortRatio"] = pd.to_numeric(df["longShortRatio"], errors="coerce")
|
| 236 |
+
df["longAccount"] = pd.to_numeric(df["longAccount"], errors="coerce")
|
| 237 |
+
df["shortAccount"] = pd.to_numeric(df["shortAccount"], errors="coerce")
|
| 238 |
+
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
|
| 239 |
+
df.set_index("timestamp", inplace=True)
|
| 240 |
+
df.sort_index(inplace=True)
|
| 241 |
+
|
| 242 |
+
logger.info("✅ L/S ratio %s: %d registros", symbol, len(df))
|
| 243 |
+
return df
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def download_open_interest(symbol: str, days: int) -> pd.DataFrame:
|
| 247 |
+
"""Descarga Open Interest histórico de Binance Futures."""
|
| 248 |
+
end_ts = int(datetime.now(timezone.utc).timestamp() * 1000)
|
| 249 |
+
start_ts = end_ts - (days * 86_400_000)
|
| 250 |
+
|
| 251 |
+
all_data = []
|
| 252 |
+
current_start = start_ts
|
| 253 |
+
|
| 254 |
+
logger.info("📥 Descargando OI %s...", symbol)
|
| 255 |
+
|
| 256 |
+
while current_start < end_ts:
|
| 257 |
+
params = {
|
| 258 |
+
"symbol": symbol,
|
| 259 |
+
"period": "4h",
|
| 260 |
+
"startTime": current_start,
|
| 261 |
+
"endTime": end_ts,
|
| 262 |
+
"limit": 500,
|
| 263 |
+
}
|
| 264 |
+
try:
|
| 265 |
+
resp = requests.get(
|
| 266 |
+
"https://fapi.binance.com/futures/data/openInterestHist",
|
| 267 |
+
params=params, timeout=30,
|
| 268 |
+
)
|
| 269 |
+
resp.raise_for_status()
|
| 270 |
+
data = resp.json()
|
| 271 |
+
except Exception as e:
|
| 272 |
+
logger.error("❌ OI error: %s", e)
|
| 273 |
+
time.sleep(2)
|
| 274 |
+
continue
|
| 275 |
+
|
| 276 |
+
if not data:
|
| 277 |
+
break
|
| 278 |
+
|
| 279 |
+
all_data.extend(data)
|
| 280 |
+
current_start = data[-1]["timestamp"] + 1
|
| 281 |
+
time.sleep(0.3)
|
| 282 |
+
|
| 283 |
+
if not all_data:
|
| 284 |
+
return pd.DataFrame()
|
| 285 |
+
|
| 286 |
+
df = pd.DataFrame(all_data)
|
| 287 |
+
df["sumOpenInterest"] = pd.to_numeric(df["sumOpenInterest"], errors="coerce")
|
| 288 |
+
df["sumOpenInterestValue"] = pd.to_numeric(df["sumOpenInterestValue"], errors="coerce")
|
| 289 |
+
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
|
| 290 |
+
df.set_index("timestamp", inplace=True)
|
| 291 |
+
df.sort_index(inplace=True)
|
| 292 |
+
|
| 293 |
+
logger.info("✅ OI %s: %d registros", symbol, len(df))
|
| 294 |
+
return df
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def download_macro_data(days: int) -> pd.DataFrame:
|
| 298 |
+
"""Descarga datos macro via yfinance: DXY, S&P500, Gold, VIX."""
|
| 299 |
+
try:
|
| 300 |
+
import yfinance as yf
|
| 301 |
+
except ImportError:
|
| 302 |
+
logger.warning("⚠️ yfinance no disponible — saltando datos macro")
|
| 303 |
+
return pd.DataFrame()
|
| 304 |
+
|
| 305 |
+
tickers = {
|
| 306 |
+
"DXY": "DX-Y.NYB",
|
| 307 |
+
"SPX": "^GSPC",
|
| 308 |
+
"GOLD": "GC=F",
|
| 309 |
+
"VIX": "^VIX",
|
| 310 |
+
"US10Y": "^TNX",
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
start_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
|
| 314 |
+
frames = {}
|
| 315 |
+
|
| 316 |
+
for name, ticker in tickers.items():
|
| 317 |
+
try:
|
| 318 |
+
logger.info("📥 Descargando %s (%s)...", name, ticker)
|
| 319 |
+
data = yf.download(ticker, start=start_date, interval="1d",
|
| 320 |
+
progress=False, auto_adjust=True)
|
| 321 |
+
if not data.empty:
|
| 322 |
+
frames[name] = data["Close"].rename(f"macro_{name.lower()}")
|
| 323 |
+
except Exception as e:
|
| 324 |
+
logger.warning("⚠️ Error descargando %s: %s", name, e)
|
| 325 |
+
|
| 326 |
+
if not frames:
|
| 327 |
+
return pd.DataFrame()
|
| 328 |
+
|
| 329 |
+
df = pd.concat(frames.values(), axis=1)
|
| 330 |
+
df.index = pd.to_datetime(df.index, utc=True)
|
| 331 |
+
df.sort_index(inplace=True)
|
| 332 |
+
df = df.ffill()
|
| 333 |
+
|
| 334 |
+
logger.info("✅ Macro data: %d días, %d indicadores", len(df), len(df.columns))
|
| 335 |
+
return df
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def download_fear_greed(days: int) -> pd.DataFrame:
|
| 339 |
+
"""Descarga Fear & Greed Index histórico."""
|
| 340 |
+
try:
|
| 341 |
+
resp = requests.get(
|
| 342 |
+
f"https://api.alternative.me/fng/?limit={days}&format=json",
|
| 343 |
+
timeout=15,
|
| 344 |
+
)
|
| 345 |
+
resp.raise_for_status()
|
| 346 |
+
data = resp.json().get("data", [])
|
| 347 |
+
except Exception as e:
|
| 348 |
+
logger.warning("⚠️ F&G error: %s", e)
|
| 349 |
+
return pd.DataFrame()
|
| 350 |
+
|
| 351 |
+
if not data:
|
| 352 |
+
return pd.DataFrame()
|
| 353 |
+
|
| 354 |
+
df = pd.DataFrame(data)
|
| 355 |
+
df["value"] = pd.to_numeric(df["value"], errors="coerce")
|
| 356 |
+
df["timestamp"] = pd.to_datetime(df["timestamp"].astype(int), unit="s", utc=True)
|
| 357 |
+
df.set_index("timestamp", inplace=True)
|
| 358 |
+
df.sort_index(inplace=True)
|
| 359 |
+
df = df.rename(columns={"value": "fear_greed"})
|
| 360 |
+
|
| 361 |
+
logger.info("✅ Fear & Greed: %d días", len(df))
|
| 362 |
+
return df[["fear_greed"]]
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
# ─────────────────────────────────────────────
|
| 366 |
+
# MAIN
|
| 367 |
+
# ─────────────────────────────────────────────
|
| 368 |
+
def main():
|
| 369 |
+
parser = argparse.ArgumentParser(description="Aurora Brain — Download Data")
|
| 370 |
+
parser.add_argument("--symbols", nargs="+", default=DEFAULT_SYMBOLS,
|
| 371 |
+
help="Pares a descargar (default: BTCUSDT ETHUSDT SOLUSDT)")
|
| 372 |
+
parser.add_argument("--timeframe", default=DEFAULT_TIMEFRAME,
|
| 373 |
+
help="Timeframe (default: 4h)")
|
| 374 |
+
parser.add_argument("--days", type=int, default=DEFAULT_DAYS,
|
| 375 |
+
help="Días de historia (default: 1825 = ~5 años)")
|
| 376 |
+
parser.add_argument("--no-derivatives", action="store_true",
|
| 377 |
+
help="Saltar datos de futuros (funding, OI, L/S)")
|
| 378 |
+
parser.add_argument("--no-macro", action="store_true",
|
| 379 |
+
help="Saltar datos macro (DXY, S&P, Gold, VIX)")
|
| 380 |
+
args = parser.parse_args()
|
| 381 |
+
|
| 382 |
+
os.makedirs(DATA_DIR, exist_ok=True)
|
| 383 |
+
|
| 384 |
+
# ── 1. Klines spot ──
|
| 385 |
+
for symbol in args.symbols:
|
| 386 |
+
df = download_klines(symbol, args.timeframe, args.days)
|
| 387 |
+
if not df.empty:
|
| 388 |
+
path = os.path.join(DATA_DIR, f"klines_{symbol}_{args.timeframe}.parquet")
|
| 389 |
+
df.to_parquet(path)
|
| 390 |
+
logger.info("💾 Guardado: %s (%d filas)", path, len(df))
|
| 391 |
+
|
| 392 |
+
# ── 2. Datos de derivados (futuros) ──
|
| 393 |
+
if not args.no_derivatives:
|
| 394 |
+
for symbol in args.symbols:
|
| 395 |
+
# Funding rate
|
| 396 |
+
df_fr = download_funding_rate(symbol, args.days)
|
| 397 |
+
if not df_fr.empty:
|
| 398 |
+
path = os.path.join(DATA_DIR, f"funding_{symbol}.parquet")
|
| 399 |
+
df_fr.to_parquet(path)
|
| 400 |
+
|
| 401 |
+
# Long/Short ratio
|
| 402 |
+
df_ls = download_long_short_ratio(symbol, args.days)
|
| 403 |
+
if not df_ls.empty:
|
| 404 |
+
path = os.path.join(DATA_DIR, f"longshort_{symbol}.parquet")
|
| 405 |
+
df_ls.to_parquet(path)
|
| 406 |
+
|
| 407 |
+
# Open Interest
|
| 408 |
+
df_oi = download_open_interest(symbol, args.days)
|
| 409 |
+
if not df_oi.empty:
|
| 410 |
+
path = os.path.join(DATA_DIR, f"oi_{symbol}.parquet")
|
| 411 |
+
df_oi.to_parquet(path)
|
| 412 |
+
|
| 413 |
+
# ── 3. Datos macro ──
|
| 414 |
+
if not args.no_macro:
|
| 415 |
+
df_macro = download_macro_data(args.days)
|
| 416 |
+
if not df_macro.empty:
|
| 417 |
+
path = os.path.join(DATA_DIR, "macro.parquet")
|
| 418 |
+
df_macro.to_parquet(path)
|
| 419 |
+
|
| 420 |
+
df_fg = download_fear_greed(min(args.days, 3650))
|
| 421 |
+
if not df_fg.empty:
|
| 422 |
+
path = os.path.join(DATA_DIR, "fear_greed.parquet")
|
| 423 |
+
df_fg.to_parquet(path)
|
| 424 |
+
|
| 425 |
+
# ── Resumen ──
|
| 426 |
+
logger.info("\n" + "=" * 60)
|
| 427 |
+
logger.info("📦 DESCARGA COMPLETA")
|
| 428 |
+
logger.info("=" * 60)
|
| 429 |
+
for f in sorted(os.listdir(DATA_DIR)):
|
| 430 |
+
if f.endswith(".parquet"):
|
| 431 |
+
size_mb = os.path.getsize(os.path.join(DATA_DIR, f)) / 1_048_576
|
| 432 |
+
logger.info(" 📄 %s (%.1f MB)", f, size_mb)
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
if __name__ == "__main__":
|
| 436 |
+
main()
|
feature_engine.py
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ AURORA BRAIN — Feature Engine ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Genera 200+ features por vela a partir de datos OHLCV + ║
|
| 6 |
+
║ derivados + macro + sentimiento. ║
|
| 7 |
+
║ ║
|
| 8 |
+
║ Categorías: ║
|
| 9 |
+
║ A. Microestructura de precio (~50 features) ║
|
| 10 |
+
║ B. Momentum y tendencia (~40 features) ║
|
| 11 |
+
║ C. Volumen y flujo (~30 features) ║
|
| 12 |
+
║ D. Cross-asset intelligence (~30 features) ║
|
| 13 |
+
║ E. On-chain y derivados (~30 features) ║
|
| 14 |
+
║ F. Sentimiento y macro (~20 features) ║
|
| 15 |
+
║ ║
|
| 16 |
+
║ Uso: ║
|
| 17 |
+
║ python feature_engine.py ║
|
| 18 |
+
║ python feature_engine.py --symbol BTCUSDT --timeframe 4h ║
|
| 19 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 20 |
+
"""
|
| 21 |
+
import os
|
| 22 |
+
import argparse
|
| 23 |
+
import logging
|
| 24 |
+
import warnings
|
| 25 |
+
|
| 26 |
+
import numpy as np
|
| 27 |
+
import pandas as pd
|
| 28 |
+
import pandas_ta as ta
|
| 29 |
+
|
| 30 |
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 31 |
+
|
| 32 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 33 |
+
logger = logging.getLogger("AuroraBrain.Features")
|
| 34 |
+
|
| 35 |
+
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ═══════════════════════════════════════════════════════════
|
| 39 |
+
# A. MICROESTRUCTURA DE PRECIO (~50 features)
|
| 40 |
+
# ═══════════════════════════════════════════════════════════
|
| 41 |
+
|
| 42 |
+
def features_microestructura(df: pd.DataFrame) -> pd.DataFrame:
|
| 43 |
+
"""Features derivados de la estructura interna de las velas."""
|
| 44 |
+
o, h, l, c = df["open"], df["high"], df["low"], df["close"]
|
| 45 |
+
rng = (h - l).replace(0, np.nan)
|
| 46 |
+
|
| 47 |
+
# Ratios de vela
|
| 48 |
+
df["f_body_ratio"] = (c - o).abs() / rng # Fuerza: 1=todo cuerpo, 0=todo mecha
|
| 49 |
+
df["f_upper_wick_ratio"] = (h - pd.concat([c, o], axis=1).max(axis=1)) / rng # Rechazo arriba
|
| 50 |
+
df["f_lower_wick_ratio"] = (pd.concat([c, o], axis=1).min(axis=1) - l) / rng # Rechazo abajo
|
| 51 |
+
df["f_price_position"] = (c - l) / rng # Dónde cerró: 1=máximo, 0=mínimo
|
| 52 |
+
df["f_is_bull"] = (c > o).astype(int)
|
| 53 |
+
df["f_is_doji"] = (df["f_body_ratio"] < 0.1).astype(int)
|
| 54 |
+
|
| 55 |
+
# Body size normalizado por ATR
|
| 56 |
+
atr14 = ta.atr(h, l, c, length=14)
|
| 57 |
+
df["f_body_atr_ratio"] = (c - o).abs() / atr14.replace(0, np.nan)
|
| 58 |
+
|
| 59 |
+
# Gap entre velas
|
| 60 |
+
df["f_gap_pct"] = (o - c.shift(1)) / c.shift(1) * 100
|
| 61 |
+
|
| 62 |
+
# Secuencias consecutivas
|
| 63 |
+
bull = (c > o).astype(int)
|
| 64 |
+
df["f_consec_bull"] = bull.groupby((bull != bull.shift()).cumsum()).cumcount() + 1
|
| 65 |
+
df["f_consec_bull"] = df["f_consec_bull"] * bull
|
| 66 |
+
|
| 67 |
+
bear = (c < o).astype(int)
|
| 68 |
+
df["f_consec_bear"] = bear.groupby((bear != bear.shift()).cumsum()).cumcount() + 1
|
| 69 |
+
df["f_consec_bear"] = df["f_consec_bear"] * bear
|
| 70 |
+
|
| 71 |
+
# Patrones de velas (booleanos)
|
| 72 |
+
df["f_engulfing_bull"] = ((c > o) & (c.shift(1) < o.shift(1)) &
|
| 73 |
+
(c > o.shift(1)) & (o < c.shift(1))).astype(int)
|
| 74 |
+
df["f_engulfing_bear"] = ((c < o) & (c.shift(1) > o.shift(1)) &
|
| 75 |
+
(c < o.shift(1)) & (o > c.shift(1))).astype(int)
|
| 76 |
+
df["f_hammer"] = ((df["f_lower_wick_ratio"] > 0.6) & (df["f_body_ratio"] < 0.3) &
|
| 77 |
+
(df["f_upper_wick_ratio"] < 0.1)).astype(int)
|
| 78 |
+
df["f_shooting_star"] = ((df["f_upper_wick_ratio"] > 0.6) & (df["f_body_ratio"] < 0.3) &
|
| 79 |
+
(df["f_lower_wick_ratio"] < 0.1)).astype(int)
|
| 80 |
+
|
| 81 |
+
# ATR y volatilidad
|
| 82 |
+
df["f_atr_14"] = atr14
|
| 83 |
+
df["f_atr_pct"] = atr14 / c * 100
|
| 84 |
+
df["f_atr_roc_5"] = atr14.pct_change(5) * 100 # Aceleración de volatilidad
|
| 85 |
+
df["f_atr_roc_14"] = atr14.pct_change(14) * 100
|
| 86 |
+
|
| 87 |
+
# Range vs promedio
|
| 88 |
+
df["f_range_ratio_20"] = rng / rng.rolling(20).mean()
|
| 89 |
+
|
| 90 |
+
# Cambio porcentual
|
| 91 |
+
for period in [1, 3, 5, 10, 20]:
|
| 92 |
+
df[f"f_return_{period}"] = c.pct_change(period) * 100
|
| 93 |
+
|
| 94 |
+
# High-Low range pct
|
| 95 |
+
df["f_hl_pct"] = rng / c * 100
|
| 96 |
+
|
| 97 |
+
# Distancia a máximos/mínimos recientes
|
| 98 |
+
df["f_dist_high_20"] = (c - h.rolling(20).max()) / c * 100
|
| 99 |
+
df["f_dist_low_20"] = (c - l.rolling(20).min()) / c * 100
|
| 100 |
+
|
| 101 |
+
# Candle pattern encoding: últimas 5 velas como patrón
|
| 102 |
+
# Codificamos cada vela como 0 (bear) o 1 (bull), generando un número 0-31
|
| 103 |
+
pattern = pd.Series(0, index=df.index, dtype=int)
|
| 104 |
+
for i in range(5):
|
| 105 |
+
pattern += bull.shift(i).fillna(0).astype(int) * (2 ** i)
|
| 106 |
+
df["f_candle_pattern_5"] = pattern
|
| 107 |
+
|
| 108 |
+
logger.info(" ✅ A. Microestructura: %d features", sum(1 for c in df.columns if c.startswith("f_")))
|
| 109 |
+
return df
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ═══════════════════════════════════════════════════════════
|
| 113 |
+
# B. MOMENTUM Y TENDENCIA (~40 features)
|
| 114 |
+
# ═══════════════════════════════════════════════════════════
|
| 115 |
+
|
| 116 |
+
def features_momentum(df: pd.DataFrame) -> pd.DataFrame:
|
| 117 |
+
"""Features de momentum, tendencia y osciladores."""
|
| 118 |
+
c, h, l = df["close"], df["high"], df["low"]
|
| 119 |
+
|
| 120 |
+
# RSI
|
| 121 |
+
df["f_rsi_14"] = ta.rsi(c, length=14)
|
| 122 |
+
df["f_rsi_7"] = ta.rsi(c, length=7)
|
| 123 |
+
df["f_rsi_roc_5"] = df["f_rsi_14"].diff(5) # RSI rate-of-change
|
| 124 |
+
df["f_rsi_roc_14"] = df["f_rsi_14"].diff(14)
|
| 125 |
+
|
| 126 |
+
# RSI divergence (precio sube pero RSI baja = divergencia bajista)
|
| 127 |
+
df["f_rsi_price_div_14"] = df["f_return_14"] - df["f_rsi_roc_14"]
|
| 128 |
+
|
| 129 |
+
# MACD
|
| 130 |
+
macd = ta.macd(c, fast=12, slow=26, signal=9)
|
| 131 |
+
if macd is not None and len(macd.columns) >= 3:
|
| 132 |
+
df["f_macd_hist"] = macd.iloc[:, 2] # Histograma
|
| 133 |
+
df["f_macd_hist_roc"] = df["f_macd_hist"].diff(3) # Aceleración MACD
|
| 134 |
+
df["f_macd_cross_bull"] = ((df["f_macd_hist"] > 0) &
|
| 135 |
+
(df["f_macd_hist"].shift(1) <= 0)).astype(int)
|
| 136 |
+
df["f_macd_cross_bear"] = ((df["f_macd_hist"] < 0) &
|
| 137 |
+
(df["f_macd_hist"].shift(1) >= 0)).astype(int)
|
| 138 |
+
|
| 139 |
+
# ADX
|
| 140 |
+
adx_data = ta.adx(h, l, c, length=14)
|
| 141 |
+
if adx_data is not None:
|
| 142 |
+
df["f_adx_14"] = adx_data.iloc[:, 0]
|
| 143 |
+
df["f_plus_di"] = adx_data.iloc[:, 1]
|
| 144 |
+
df["f_minus_di"] = adx_data.iloc[:, 2]
|
| 145 |
+
df["f_di_ratio"] = df["f_plus_di"] / df["f_minus_di"].replace(0, np.nan)
|
| 146 |
+
|
| 147 |
+
adx_28 = ta.adx(h, l, c, length=28)
|
| 148 |
+
if adx_28 is not None:
|
| 149 |
+
df["f_adx_28"] = adx_28.iloc[:, 0]
|
| 150 |
+
|
| 151 |
+
# EMAs
|
| 152 |
+
for length in [9, 21, 55, 200]:
|
| 153 |
+
ema = ta.ema(c, length=length)
|
| 154 |
+
df[f"f_ema_{length}"] = ema
|
| 155 |
+
df[f"f_dist_ema_{length}"] = (c - ema) / ema * 100 # Distancia % al EMA
|
| 156 |
+
df[f"f_ema_{length}_slope"] = ema.diff(5) / ema * 100 # Pendiente (ángulo)
|
| 157 |
+
|
| 158 |
+
# Orden de EMAs (1=alcista perfecto: 9>21>55>200)
|
| 159 |
+
df["f_ema_order_bull"] = (
|
| 160 |
+
(df["f_ema_9"] > df["f_ema_21"]) &
|
| 161 |
+
(df["f_ema_21"] > df["f_ema_55"]) &
|
| 162 |
+
(df["f_ema_55"] > df["f_ema_200"])
|
| 163 |
+
).astype(int)
|
| 164 |
+
df["f_ema_order_bear"] = (
|
| 165 |
+
(df["f_ema_9"] < df["f_ema_21"]) &
|
| 166 |
+
(df["f_ema_21"] < df["f_ema_55"]) &
|
| 167 |
+
(df["f_ema_55"] < df["f_ema_200"])
|
| 168 |
+
).astype(int)
|
| 169 |
+
|
| 170 |
+
# Regresión lineal — pendiente
|
| 171 |
+
for period in [10, 20, 50]:
|
| 172 |
+
slope = ta.linreg(c, length=period, tsf=False)
|
| 173 |
+
if slope is not None:
|
| 174 |
+
df[f"f_linreg_slope_{period}"] = slope
|
| 175 |
+
|
| 176 |
+
# Bollinger Bands
|
| 177 |
+
bb = ta.bbands(c, length=20, std=2)
|
| 178 |
+
if bb is not None and len(bb.columns) >= 3:
|
| 179 |
+
df["f_bb_upper"] = bb.iloc[:, 0]
|
| 180 |
+
df["f_bb_mid"] = bb.iloc[:, 1]
|
| 181 |
+
df["f_bb_lower"] = bb.iloc[:, 2]
|
| 182 |
+
df["f_bb_width"] = (df["f_bb_upper"] - df["f_bb_lower"]) / df["f_bb_mid"] * 100
|
| 183 |
+
df["f_bb_position"] = (c - df["f_bb_lower"]) / (df["f_bb_upper"] - df["f_bb_lower"]).replace(0, np.nan)
|
| 184 |
+
|
| 185 |
+
# Stochastic
|
| 186 |
+
stoch = ta.stoch(h, l, c, k=14, d=3)
|
| 187 |
+
if stoch is not None and len(stoch.columns) >= 2:
|
| 188 |
+
df["f_stoch_k"] = stoch.iloc[:, 0]
|
| 189 |
+
df["f_stoch_d"] = stoch.iloc[:, 1]
|
| 190 |
+
|
| 191 |
+
# Higher Highs / Higher Lows conteo (proxy de HH+HL para bias)
|
| 192 |
+
hh_count = 0
|
| 193 |
+
hl_count = 0
|
| 194 |
+
ll_count = 0
|
| 195 |
+
lh_count = 0
|
| 196 |
+
swing_h = h.rolling(10).max()
|
| 197 |
+
swing_l = l.rolling(10).min()
|
| 198 |
+
prev_sh = swing_h.shift(10)
|
| 199 |
+
prev_sl = swing_l.shift(10)
|
| 200 |
+
df["f_hh_count_20"] = ((swing_h > prev_sh).rolling(20).sum()).fillna(0)
|
| 201 |
+
df["f_hl_count_20"] = ((swing_l > prev_sl).rolling(20).sum()).fillna(0)
|
| 202 |
+
df["f_ll_count_20"] = ((swing_l < prev_sl).rolling(20).sum()).fillna(0)
|
| 203 |
+
df["f_lh_count_20"] = ((swing_h < prev_sh).rolling(20).sum()).fillna(0)
|
| 204 |
+
|
| 205 |
+
n_momentum = sum(1 for col in df.columns if col.startswith("f_") and
|
| 206 |
+
any(x in col for x in ["rsi", "macd", "adx", "ema", "linreg", "bb", "stoch", "hh_", "hl_", "ll_", "lh_", "di_"]))
|
| 207 |
+
logger.info(" ✅ B. Momentum: ~%d features", n_momentum)
|
| 208 |
+
return df
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# ═══════════════════════════════════════════════════════════
|
| 212 |
+
# C. VOLUMEN Y FLUJO (~30 features)
|
| 213 |
+
# ══════════════════════��════════════════════════════════════
|
| 214 |
+
|
| 215 |
+
def features_volumen(df: pd.DataFrame) -> pd.DataFrame:
|
| 216 |
+
"""Features de volumen, flujo de órdenes y liquidez."""
|
| 217 |
+
v = df["volume"]
|
| 218 |
+
c = df["close"]
|
| 219 |
+
qv = df.get("quote_volume", v * c)
|
| 220 |
+
tbr = df.get("taker_buy_ratio", pd.Series(0.5, index=df.index))
|
| 221 |
+
|
| 222 |
+
# Volumen relativo
|
| 223 |
+
df["f_vol_sma_20"] = v.rolling(20).mean()
|
| 224 |
+
df["f_vol_ratio_20"] = v / df["f_vol_sma_20"].replace(0, np.nan)
|
| 225 |
+
df["f_vol_ratio_5"] = v / v.rolling(5).mean().replace(0, np.nan)
|
| 226 |
+
|
| 227 |
+
# Aceleración de volumen
|
| 228 |
+
df["f_vol_accel"] = df["f_vol_ratio_20"].diff(3)
|
| 229 |
+
|
| 230 |
+
# Volume spike (>2x promedio)
|
| 231 |
+
df["f_vol_spike"] = (df["f_vol_ratio_20"] > 2.0).astype(int)
|
| 232 |
+
|
| 233 |
+
# OBV (On Balance Volume)
|
| 234 |
+
obv = ta.obv(c, v)
|
| 235 |
+
if obv is not None:
|
| 236 |
+
df["f_obv"] = obv
|
| 237 |
+
df["f_obv_slope_10"] = obv.diff(10) / obv.abs().replace(0, np.nan) * 100
|
| 238 |
+
|
| 239 |
+
# Volume-weighted price deviation
|
| 240 |
+
vwap_approx = (qv.rolling(20).sum()) / (v.rolling(20).sum().replace(0, np.nan))
|
| 241 |
+
df["f_vwap_dev"] = (c - vwap_approx) / vwap_approx * 100
|
| 242 |
+
|
| 243 |
+
# Taker buy ratio
|
| 244 |
+
df["f_tbr"] = tbr
|
| 245 |
+
df["f_tbr_sma_10"] = tbr.rolling(10).mean()
|
| 246 |
+
df["f_tbr_roc_5"] = tbr.diff(5)
|
| 247 |
+
|
| 248 |
+
# Quote volume rate-of-change
|
| 249 |
+
df["f_qvol_roc_5"] = qv.pct_change(5) * 100
|
| 250 |
+
|
| 251 |
+
# Trades count si disponible
|
| 252 |
+
if "trades" in df.columns:
|
| 253 |
+
trades = df["trades"]
|
| 254 |
+
df["f_trades_ratio_20"] = trades / trades.rolling(20).mean().replace(0, np.nan)
|
| 255 |
+
df["f_avg_trade_size"] = qv / trades.replace(0, np.nan)
|
| 256 |
+
|
| 257 |
+
# Volume profile proxy: ratio de volumen en mitad superior vs inferior del rango
|
| 258 |
+
mid_price = (df["high"] + df["low"]) / 2
|
| 259 |
+
df["f_vol_above_mid"] = ((c > mid_price) * v).rolling(20).sum()
|
| 260 |
+
df["f_vol_below_mid"] = ((c <= mid_price) * v).rolling(20).sum()
|
| 261 |
+
df["f_vol_balance"] = df["f_vol_above_mid"] / (df["f_vol_above_mid"] + df["f_vol_below_mid"]).replace(0, np.nan)
|
| 262 |
+
|
| 263 |
+
n_vol = sum(1 for col in df.columns if col.startswith("f_vol") or col.startswith("f_obv") or
|
| 264 |
+
col.startswith("f_vwap") or col.startswith("f_tbr") or col.startswith("f_qvol") or
|
| 265 |
+
col.startswith("f_trades") or col.startswith("f_avg_trade"))
|
| 266 |
+
logger.info(" ✅ C. Volumen: ~%d features", n_vol)
|
| 267 |
+
return df
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# ═══════════════════════════════════════════════════════════
|
| 271 |
+
# D. CROSS-ASSET INTELLIGENCE (~30 features)
|
| 272 |
+
# ═══════════════════════════════════════════════════════════
|
| 273 |
+
|
| 274 |
+
def features_cross_asset(df: pd.DataFrame, df_btc: pd.DataFrame = None,
|
| 275 |
+
df_macro: pd.DataFrame = None) -> pd.DataFrame:
|
| 276 |
+
"""Features de correlación entre activos y datos macro."""
|
| 277 |
+
c = df["close"]
|
| 278 |
+
symbol = df.get("symbol", pd.Series("UNKNOWN", index=df.index)).iloc[0] if "symbol" in df.columns else "UNKNOWN"
|
| 279 |
+
|
| 280 |
+
# Cross-asset con BTC (si no es BTC mismo)
|
| 281 |
+
if df_btc is not None and symbol != "BTCUSDT":
|
| 282 |
+
btc_c = df_btc["close"].reindex(df.index, method="ffill")
|
| 283 |
+
|
| 284 |
+
# Correlación rolling
|
| 285 |
+
df["f_corr_btc_30"] = c.rolling(30).corr(btc_c)
|
| 286 |
+
df["f_corr_btc_90"] = c.rolling(90).corr(btc_c)
|
| 287 |
+
|
| 288 |
+
# Beta respecto a BTC
|
| 289 |
+
btc_ret = btc_c.pct_change()
|
| 290 |
+
sym_ret = c.pct_change()
|
| 291 |
+
cov = sym_ret.rolling(30).cov(btc_ret)
|
| 292 |
+
var = btc_ret.rolling(30).var()
|
| 293 |
+
df["f_beta_btc_30"] = cov / var.replace(0, np.nan)
|
| 294 |
+
|
| 295 |
+
# Lead-lag: ¿se movió antes que BTC?
|
| 296 |
+
for lag in [1, 3, 6]:
|
| 297 |
+
df[f"f_lead_btc_{lag}"] = sym_ret.shift(lag).rolling(10).corr(btc_ret)
|
| 298 |
+
|
| 299 |
+
# Spread normalizado
|
| 300 |
+
df["f_spread_btc"] = (c / btc_c).pct_change(5) * 100
|
| 301 |
+
else:
|
| 302 |
+
# Si ES BTC, agregamos 0s
|
| 303 |
+
df["f_corr_btc_30"] = 1.0
|
| 304 |
+
df["f_corr_btc_90"] = 1.0
|
| 305 |
+
|
| 306 |
+
# Datos macro (si disponibles)
|
| 307 |
+
if df_macro is not None and not df_macro.empty:
|
| 308 |
+
# Reindexar macro a la frecuencia del dataframe principal
|
| 309 |
+
for col in df_macro.columns:
|
| 310 |
+
macro_series = df_macro[col].reindex(df.index, method="ffill")
|
| 311 |
+
df[f"f_{col}"] = macro_series
|
| 312 |
+
|
| 313 |
+
# Rate-of-change macro
|
| 314 |
+
df[f"f_{col}_roc_5d"] = macro_series.pct_change(5) * 100
|
| 315 |
+
|
| 316 |
+
n_cross = sum(1 for col in df.columns if "corr_" in col or "beta_" in col or
|
| 317 |
+
"lead_" in col or "spread_" in col or "macro_" in col)
|
| 318 |
+
logger.info(" ✅ D. Cross-asset: ~%d features", n_cross)
|
| 319 |
+
return df
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# ═══════════════════════════════════════════════════════════
|
| 323 |
+
# E. ON-CHAIN Y DERIVADOS (~30 features)
|
| 324 |
+
# ═════════════════════════════════════════════���═════════════
|
| 325 |
+
|
| 326 |
+
def features_onchain(df: pd.DataFrame, df_funding: pd.DataFrame = None,
|
| 327 |
+
df_ls: pd.DataFrame = None, df_oi: pd.DataFrame = None) -> pd.DataFrame:
|
| 328 |
+
"""Features de datos on-chain y derivados."""
|
| 329 |
+
|
| 330 |
+
# Funding rate
|
| 331 |
+
if df_funding is not None and not df_funding.empty:
|
| 332 |
+
fr = df_funding["fundingRate"].reindex(df.index, method="ffill")
|
| 333 |
+
df["f_funding_rate"] = fr
|
| 334 |
+
df["f_funding_rate_sma_10"] = fr.rolling(10).mean()
|
| 335 |
+
df["f_funding_rate_extreme_pos"] = (fr > 0.001).astype(int) # Overleveraged longs
|
| 336 |
+
df["f_funding_rate_extreme_neg"] = (fr < -0.001).astype(int) # Overleveraged shorts
|
| 337 |
+
df["f_funding_rate_roc"] = fr.diff(3)
|
| 338 |
+
|
| 339 |
+
# Long/Short ratio
|
| 340 |
+
if df_ls is not None and not df_ls.empty:
|
| 341 |
+
ls = df_ls["longShortRatio"].reindex(df.index, method="ffill")
|
| 342 |
+
df["f_ls_ratio"] = ls
|
| 343 |
+
df["f_ls_ratio_sma_10"] = ls.rolling(10).mean()
|
| 344 |
+
df["f_ls_ratio_roc"] = ls.pct_change(5) * 100
|
| 345 |
+
|
| 346 |
+
if "longAccount" in df_ls.columns:
|
| 347 |
+
long_pct = df_ls["longAccount"].reindex(df.index, method="ffill")
|
| 348 |
+
df["f_long_pct"] = long_pct
|
| 349 |
+
|
| 350 |
+
# Open Interest
|
| 351 |
+
if df_oi is not None and not df_oi.empty:
|
| 352 |
+
oi = df_oi["sumOpenInterestValue"].reindex(df.index, method="ffill")
|
| 353 |
+
df["f_oi_value"] = oi
|
| 354 |
+
df["f_oi_roc_5"] = oi.pct_change(5) * 100
|
| 355 |
+
df["f_oi_roc_24"] = oi.pct_change(24) * 100 # ~24 velas de 4h ≈ 4 días
|
| 356 |
+
|
| 357 |
+
# OI divergence: precio sube pero OI baja = movimiento sin respaldo
|
| 358 |
+
if "f_return_5" in df.columns:
|
| 359 |
+
df["f_oi_price_div"] = df["f_return_5"] - df["f_oi_roc_5"]
|
| 360 |
+
|
| 361 |
+
n_onchain = sum(1 for col in df.columns if "funding" in col or "ls_" in col or
|
| 362 |
+
"oi_" in col or "long_pct" in col)
|
| 363 |
+
logger.info(" ✅ E. On-chain: ~%d features", n_onchain)
|
| 364 |
+
return df
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
# ═══════════════════════════════════════════════════════════
|
| 368 |
+
# F. SENTIMIENTO Y MACRO (~20 features)
|
| 369 |
+
# ═══════════════════════════════════════════════════════════
|
| 370 |
+
|
| 371 |
+
def features_sentimiento(df: pd.DataFrame, df_fg: pd.DataFrame = None) -> pd.DataFrame:
|
| 372 |
+
"""Features de sentimiento y patrones temporales."""
|
| 373 |
+
|
| 374 |
+
# Fear & Greed
|
| 375 |
+
if df_fg is not None and not df_fg.empty:
|
| 376 |
+
fg = df_fg["fear_greed"].reindex(df.index, method="ffill")
|
| 377 |
+
df["f_fear_greed"] = fg
|
| 378 |
+
df["f_fear_greed_roc_5"] = fg.diff(5)
|
| 379 |
+
df["f_fear_greed_extreme_fear"] = (fg < 25).astype(int)
|
| 380 |
+
df["f_fear_greed_extreme_greed"] = (fg > 75).astype(int)
|
| 381 |
+
|
| 382 |
+
# Patrones temporales
|
| 383 |
+
if isinstance(df.index, pd.DatetimeIndex):
|
| 384 |
+
df["f_hour_of_day"] = df.index.hour
|
| 385 |
+
df["f_day_of_week"] = df.index.dayofweek
|
| 386 |
+
df["f_is_weekend"] = (df.index.dayofweek >= 5).astype(int)
|
| 387 |
+
|
| 388 |
+
# Meses estacionalmente fuertes para BTC (históricamente: Oct, Nov, Abr)
|
| 389 |
+
df["f_month"] = df.index.month
|
| 390 |
+
df["f_is_q4"] = (df.index.month >= 10).astype(int)
|
| 391 |
+
|
| 392 |
+
# Días desde halving BTC (aprox)
|
| 393 |
+
# Halvings: 2012-11-28, 2016-07-09, 2020-05-11, 2024-04-20
|
| 394 |
+
halvings = pd.to_datetime(["2020-05-11", "2024-04-20"], utc=True)
|
| 395 |
+
last_halving = halvings[-1]
|
| 396 |
+
df["f_days_since_halving"] = (df.index - last_halving).days
|
| 397 |
+
|
| 398 |
+
n_sent = sum(1 for col in df.columns if "fear_greed" in col or "hour_" in col or
|
| 399 |
+
"day_" in col or "weekend" in col or "month" in col or "halving" in col or "q4" in col)
|
| 400 |
+
logger.info(" ✅ F. Sentimiento: ~%d features", n_sent)
|
| 401 |
+
return df
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
# ═══════════════════════════════════════════════════════════
|
| 405 |
+
# ORQUESTADOR PRINCIPAL
|
| 406 |
+
# ═══════════════════════════════════════════════════════════
|
| 407 |
+
|
| 408 |
+
def generate_features(symbol: str, timeframe: str = "4h",
|
| 409 |
+
btc_symbol: str = "BTCUSDT") -> pd.DataFrame:
|
| 410 |
+
"""
|
| 411 |
+
Genera todas las features para un par dado.
|
| 412 |
+
Lee los archivos parquet del directorio data/.
|
| 413 |
+
"""
|
| 414 |
+
logger.info("🔧 Generando features para %s %s...", symbol, timeframe)
|
| 415 |
+
|
| 416 |
+
# ── Cargar datos ──
|
| 417 |
+
klines_path = os.path.join(DATA_DIR, f"klines_{symbol}_{timeframe}.parquet")
|
| 418 |
+
if not os.path.exists(klines_path):
|
| 419 |
+
logger.error("❌ No se encontró %s", klines_path)
|
| 420 |
+
return pd.DataFrame()
|
| 421 |
+
|
| 422 |
+
df = pd.read_parquet(klines_path)
|
| 423 |
+
logger.info(" 📊 %d velas cargadas", len(df))
|
| 424 |
+
|
| 425 |
+
# BTC para cross-asset (si no es BTC)
|
| 426 |
+
df_btc = None
|
| 427 |
+
if symbol != btc_symbol:
|
| 428 |
+
btc_path = os.path.join(DATA_DIR, f"klines_{btc_symbol}_{timeframe}.parquet")
|
| 429 |
+
if os.path.exists(btc_path):
|
| 430 |
+
df_btc = pd.read_parquet(btc_path)
|
| 431 |
+
|
| 432 |
+
# Derivados
|
| 433 |
+
df_funding = _load_parquet(f"funding_{symbol}.parquet")
|
| 434 |
+
df_ls = _load_parquet(f"longshort_{symbol}.parquet")
|
| 435 |
+
df_oi = _load_parquet(f"oi_{symbol}.parquet")
|
| 436 |
+
|
| 437 |
+
# Macro y sentimiento
|
| 438 |
+
df_macro = _load_parquet("macro.parquet")
|
| 439 |
+
df_fg = _load_parquet("fear_greed.parquet")
|
| 440 |
+
|
| 441 |
+
# ── Generar features por categoría ──
|
| 442 |
+
df = features_microestructura(df)
|
| 443 |
+
df = features_momentum(df)
|
| 444 |
+
df = features_volumen(df)
|
| 445 |
+
df = features_cross_asset(df, df_btc=df_btc, df_macro=df_macro)
|
| 446 |
+
df = features_onchain(df, df_funding=df_funding, df_ls=df_ls, df_oi=df_oi)
|
| 447 |
+
df = features_sentimiento(df, df_fg=df_fg)
|
| 448 |
+
|
| 449 |
+
# ── Limpieza ──
|
| 450 |
+
feature_cols = [c for c in df.columns if c.startswith("f_")]
|
| 451 |
+
df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)
|
| 452 |
+
|
| 453 |
+
# Warmup: eliminar primeras 200 filas donde la mayoría de features son NaN
|
| 454 |
+
warmup = 210
|
| 455 |
+
df = df.iloc[warmup:]
|
| 456 |
+
|
| 457 |
+
n_features = len(feature_cols)
|
| 458 |
+
n_rows = len(df)
|
| 459 |
+
nan_pct = df[feature_cols].isna().mean().mean() * 100
|
| 460 |
+
|
| 461 |
+
logger.info("\n" + "=" * 60)
|
| 462 |
+
logger.info("🔧 FEATURES GENERADAS: %s %s", symbol, timeframe)
|
| 463 |
+
logger.info(" 📊 %d velas × %d features", n_rows, n_features)
|
| 464 |
+
logger.info(" 📉 NaN promedio: %.1f%%", nan_pct)
|
| 465 |
+
logger.info("=" * 60)
|
| 466 |
+
|
| 467 |
+
return df
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def _load_parquet(filename: str) -> pd.DataFrame:
|
| 471 |
+
"""Carga un parquet si existe, retorna DataFrame vacío si no."""
|
| 472 |
+
path = os.path.join(DATA_DIR, filename)
|
| 473 |
+
if os.path.exists(path):
|
| 474 |
+
return pd.read_parquet(path)
|
| 475 |
+
return pd.DataFrame()
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
def get_feature_columns(df: pd.DataFrame) -> list[str]:
|
| 479 |
+
"""Retorna la lista de columnas de features (prefijo f_)."""
|
| 480 |
+
return sorted([c for c in df.columns if c.startswith("f_")])
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
# ─────────────────────────────────────────────
|
| 484 |
+
# MAIN
|
| 485 |
+
# ─────────────────────────────────────────────
|
| 486 |
+
def main():
|
| 487 |
+
parser = argparse.ArgumentParser(description="Aurora Brain — Feature Engine")
|
| 488 |
+
parser.add_argument("--symbol", default="BTCUSDT", help="Par (default: BTCUSDT)")
|
| 489 |
+
parser.add_argument("--timeframe", default="4h", help="Timeframe (default: 4h)")
|
| 490 |
+
parser.add_argument("--all", action="store_true", help="Generar para BTC + ETH + SOL")
|
| 491 |
+
args = parser.parse_args()
|
| 492 |
+
|
| 493 |
+
symbols = ["BTCUSDT", "ETHUSDT", "SOLUSDT"] if args.all else [args.symbol]
|
| 494 |
+
|
| 495 |
+
for symbol in symbols:
|
| 496 |
+
df = generate_features(symbol, args.timeframe)
|
| 497 |
+
if not df.empty:
|
| 498 |
+
path = os.path.join(DATA_DIR, f"features_{symbol}_{args.timeframe}.parquet")
|
| 499 |
+
df.to_parquet(path)
|
| 500 |
+
logger.info("💾 Guardado: %s", path)
|
| 501 |
+
|
| 502 |
+
# Mostrar top features por varianza (las más informativas)
|
| 503 |
+
feature_cols = get_feature_columns(df)
|
| 504 |
+
variance = df[feature_cols].var().sort_values(ascending=False)
|
| 505 |
+
logger.info("\n📊 Top 20 features por varianza:")
|
| 506 |
+
for i, (col, var) in enumerate(variance.head(20).items()):
|
| 507 |
+
logger.info(" %2d. %-30s var=%.4f", i + 1, col, var)
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
if __name__ == "__main__":
|
| 511 |
+
main()
|
regime_detector.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ AURORA BRAIN — Regime Detector (Capa 1) ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Entrena un XGBoost para clasificar el régimen actual ║
|
| 6 |
+
║ del mercado: TRENDING / RANGING / VOLATILE / BREAKOUT ║
|
| 7 |
+
║ ║
|
| 8 |
+
║ Walk-forward validation para evitar overfitting. ║
|
| 9 |
+
║ Feature importance para interpretabilidad. ║
|
| 10 |
+
║ ║
|
| 11 |
+
║ Uso: ║
|
| 12 |
+
║ python regime_detector.py ║
|
| 13 |
+
║ python regime_detector.py --symbol BTCUSDT --test-pct 20 ║
|
| 14 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 15 |
+
"""
|
| 16 |
+
import os
|
| 17 |
+
import json
|
| 18 |
+
import argparse
|
| 19 |
+
import logging
|
| 20 |
+
import pickle
|
| 21 |
+
from datetime import datetime, timezone
|
| 22 |
+
|
| 23 |
+
import numpy as np
|
| 24 |
+
import pandas as pd
|
| 25 |
+
from sklearn.model_selection import TimeSeriesSplit
|
| 26 |
+
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
|
| 27 |
+
from sklearn.preprocessing import LabelEncoder
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import xgboost as xgb
|
| 31 |
+
except ImportError:
|
| 32 |
+
xgb = None
|
| 33 |
+
|
| 34 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 35 |
+
logger = logging.getLogger("AuroraBrain.Regime")
|
| 36 |
+
|
| 37 |
+
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 38 |
+
MODELS_DIR = os.path.join(os.path.dirname(__file__), "models")
|
| 39 |
+
|
| 40 |
+
REGIME_NAMES = {0: "TRENDING", 1: "RANGING", 2: "VOLATILE", 3: "BREAKOUT"}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def select_features(df: pd.DataFrame, target_col: str = "regime",
|
| 44 |
+
max_features: int = 80) -> list[str]:
|
| 45 |
+
"""
|
| 46 |
+
Selecciona las features más relevantes para el modelo.
|
| 47 |
+
Combina: varianza mínima + correlación con target.
|
| 48 |
+
"""
|
| 49 |
+
feature_cols = sorted([c for c in df.columns if c.startswith("f_")])
|
| 50 |
+
|
| 51 |
+
# Filtrar features con poca varianza
|
| 52 |
+
variances = df[feature_cols].var()
|
| 53 |
+
low_var = variances[variances < 1e-8].index.tolist()
|
| 54 |
+
feature_cols = [c for c in feature_cols if c not in low_var]
|
| 55 |
+
|
| 56 |
+
# Filtrar features con >30% NaN
|
| 57 |
+
nan_pct = df[feature_cols].isna().mean()
|
| 58 |
+
high_nan = nan_pct[nan_pct > 0.3].index.tolist()
|
| 59 |
+
feature_cols = [c for c in feature_cols if c not in high_nan]
|
| 60 |
+
|
| 61 |
+
# Correlación con target
|
| 62 |
+
valid = df.dropna(subset=[target_col])
|
| 63 |
+
if len(valid) > 100:
|
| 64 |
+
corr = valid[feature_cols].corrwith(valid[target_col]).abs()
|
| 65 |
+
corr = corr.sort_values(ascending=False)
|
| 66 |
+
feature_cols = corr.head(max_features).index.tolist()
|
| 67 |
+
|
| 68 |
+
logger.info(" 📊 Features seleccionadas: %d (de %d originales)",
|
| 69 |
+
len(feature_cols), sum(1 for c in df.columns if c.startswith("f_")))
|
| 70 |
+
return feature_cols
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def train_regime_model(df: pd.DataFrame, feature_cols: list[str],
|
| 74 |
+
test_pct: float = 20.0, n_splits: int = 5) -> dict:
|
| 75 |
+
"""
|
| 76 |
+
Entrena el detector de régimen con walk-forward validation.
|
| 77 |
+
|
| 78 |
+
Returns:
|
| 79 |
+
dict con modelo, métricas, feature importance
|
| 80 |
+
"""
|
| 81 |
+
if xgb is None:
|
| 82 |
+
logger.error("❌ xgboost no instalado")
|
| 83 |
+
return {}
|
| 84 |
+
|
| 85 |
+
# Filtrar velas sin régimen etiquetado
|
| 86 |
+
valid = df[df["regime"] >= 0].copy()
|
| 87 |
+
valid = valid.dropna(subset=feature_cols, how="all")
|
| 88 |
+
|
| 89 |
+
# Fill NaN con mediana (para features parciales)
|
| 90 |
+
X = valid[feature_cols].fillna(valid[feature_cols].median())
|
| 91 |
+
y = valid["regime"].astype(int)
|
| 92 |
+
|
| 93 |
+
logger.info(" 📊 Dataset: %d muestras, %d features, %d clases",
|
| 94 |
+
len(X), len(feature_cols), y.nunique())
|
| 95 |
+
|
| 96 |
+
# ── Split temporal (walk-forward) ──
|
| 97 |
+
split_idx = int(len(X) * (1 - test_pct / 100))
|
| 98 |
+
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
|
| 99 |
+
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
|
| 100 |
+
|
| 101 |
+
logger.info(" 📊 Train: %d | Test: %d (%.0f%%)", len(X_train), len(X_test), test_pct)
|
| 102 |
+
|
| 103 |
+
# ── Entrenamiento XGBoost ──
|
| 104 |
+
class_counts = y_train.value_counts()
|
| 105 |
+
total = len(y_train)
|
| 106 |
+
sample_weights = y_train.map(lambda x: total / (len(class_counts) * class_counts[x]))
|
| 107 |
+
|
| 108 |
+
model = xgb.XGBClassifier(
|
| 109 |
+
n_estimators=300,
|
| 110 |
+
max_depth=6,
|
| 111 |
+
learning_rate=0.05,
|
| 112 |
+
subsample=0.8,
|
| 113 |
+
colsample_bytree=0.8,
|
| 114 |
+
min_child_weight=5,
|
| 115 |
+
gamma=0.1,
|
| 116 |
+
reg_alpha=0.1,
|
| 117 |
+
reg_lambda=1.0,
|
| 118 |
+
objective="multi:softprob",
|
| 119 |
+
num_class=4,
|
| 120 |
+
eval_metric="mlogloss",
|
| 121 |
+
random_state=42,
|
| 122 |
+
n_jobs=-1,
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
model.fit(
|
| 126 |
+
X_train, y_train,
|
| 127 |
+
sample_weight=sample_weights,
|
| 128 |
+
eval_set=[(X_test, y_test)],
|
| 129 |
+
verbose=False,
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
# ── Evaluación ──
|
| 133 |
+
y_pred = model.predict(X_test)
|
| 134 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 135 |
+
report = classification_report(y_test, y_pred, target_names=list(REGIME_NAMES.values()),
|
| 136 |
+
output_dict=True, zero_division=0)
|
| 137 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 138 |
+
|
| 139 |
+
logger.info("\n" + "=" * 60)
|
| 140 |
+
logger.info("🎯 RESULTADOS DEL DETECTOR DE RÉGIMEN")
|
| 141 |
+
logger.info("=" * 60)
|
| 142 |
+
logger.info(" Accuracy: %.2f%%", accuracy * 100)
|
| 143 |
+
logger.info("\n%s", classification_report(y_test, y_pred,
|
| 144 |
+
target_names=list(REGIME_NAMES.values()), zero_division=0))
|
| 145 |
+
|
| 146 |
+
# ── Feature importance ──
|
| 147 |
+
importances = model.feature_importances_
|
| 148 |
+
feat_imp = sorted(zip(feature_cols, importances), key=lambda x: -x[1])
|
| 149 |
+
|
| 150 |
+
logger.info("\n📊 Top 20 features más importantes:")
|
| 151 |
+
for i, (feat, imp) in enumerate(feat_imp[:20]):
|
| 152 |
+
logger.info(" %2d. %-35s %.4f", i + 1, feat, imp)
|
| 153 |
+
|
| 154 |
+
# ── Walk-forward cross-validation ──
|
| 155 |
+
tscv = TimeSeriesSplit(n_splits=n_splits)
|
| 156 |
+
cv_scores = []
|
| 157 |
+
|
| 158 |
+
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
|
| 159 |
+
cv_model = xgb.XGBClassifier(
|
| 160 |
+
n_estimators=200, max_depth=6, learning_rate=0.05,
|
| 161 |
+
subsample=0.8, colsample_bytree=0.8,
|
| 162 |
+
objective="multi:softprob", num_class=4,
|
| 163 |
+
random_state=42, n_jobs=-1,
|
| 164 |
+
)
|
| 165 |
+
cv_model.fit(X.iloc[train_idx], y.iloc[train_idx], verbose=False)
|
| 166 |
+
cv_pred = cv_model.predict(X.iloc[val_idx])
|
| 167 |
+
cv_acc = accuracy_score(y.iloc[val_idx], cv_pred)
|
| 168 |
+
cv_scores.append(cv_acc)
|
| 169 |
+
logger.info(" Fold %d: %.2f%%", fold + 1, cv_acc * 100)
|
| 170 |
+
|
| 171 |
+
avg_cv = np.mean(cv_scores)
|
| 172 |
+
std_cv = np.std(cv_scores)
|
| 173 |
+
logger.info(" Walk-Forward CV: %.2f%% ± %.2f%%", avg_cv * 100, std_cv * 100)
|
| 174 |
+
|
| 175 |
+
# ── Guardar modelo ──
|
| 176 |
+
os.makedirs(MODELS_DIR, exist_ok=True)
|
| 177 |
+
model_path = os.path.join(MODELS_DIR, "regime_model.pkl")
|
| 178 |
+
with open(model_path, "wb") as f:
|
| 179 |
+
pickle.dump(model, f)
|
| 180 |
+
|
| 181 |
+
# Guardar metadatos
|
| 182 |
+
metadata = {
|
| 183 |
+
"trained_at": datetime.now(timezone.utc).isoformat(),
|
| 184 |
+
"accuracy": round(accuracy * 100, 2),
|
| 185 |
+
"cv_accuracy": round(avg_cv * 100, 2),
|
| 186 |
+
"cv_std": round(std_cv * 100, 2),
|
| 187 |
+
"n_train": len(X_train),
|
| 188 |
+
"n_test": len(X_test),
|
| 189 |
+
"n_features": len(feature_cols),
|
| 190 |
+
"feature_cols": feature_cols,
|
| 191 |
+
"top_features": [{"name": f, "importance": round(float(i), 4)}
|
| 192 |
+
for f, i in feat_imp[:30]],
|
| 193 |
+
"class_report": report,
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
meta_path = os.path.join(MODELS_DIR, "regime_metadata.json")
|
| 197 |
+
with open(meta_path, "w") as f:
|
| 198 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 199 |
+
|
| 200 |
+
logger.info("\n💾 Modelo guardado: %s", model_path)
|
| 201 |
+
logger.info("💾 Metadata: %s", meta_path)
|
| 202 |
+
|
| 203 |
+
return {
|
| 204 |
+
"model": model,
|
| 205 |
+
"accuracy": accuracy,
|
| 206 |
+
"cv_accuracy": avg_cv,
|
| 207 |
+
"feature_cols": feature_cols,
|
| 208 |
+
"metadata": metadata,
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def predict_regime(df: pd.DataFrame) -> dict:
|
| 213 |
+
"""
|
| 214 |
+
Predice el régimen actual usando el modelo guardado.
|
| 215 |
+
Retorna dict con régimen, probabilidades y confianza.
|
| 216 |
+
"""
|
| 217 |
+
model_path = os.path.join(MODELS_DIR, "regime_model.pkl")
|
| 218 |
+
meta_path = os.path.join(MODELS_DIR, "regime_metadata.json")
|
| 219 |
+
|
| 220 |
+
if not os.path.exists(model_path):
|
| 221 |
+
return {"error": "Modelo no encontrado — entrenar primero"}
|
| 222 |
+
|
| 223 |
+
with open(model_path, "rb") as f:
|
| 224 |
+
model = pickle.load(f)
|
| 225 |
+
with open(meta_path, "r") as f:
|
| 226 |
+
metadata = json.load(f)
|
| 227 |
+
|
| 228 |
+
feature_cols = metadata["feature_cols"]
|
| 229 |
+
|
| 230 |
+
# Preparar features de la última vela
|
| 231 |
+
last_row = df[feature_cols].iloc[-1:].fillna(df[feature_cols].median())
|
| 232 |
+
|
| 233 |
+
# Predecir
|
| 234 |
+
proba = model.predict_proba(last_row)[0]
|
| 235 |
+
regime_id = int(np.argmax(proba))
|
| 236 |
+
confidence = float(proba[regime_id])
|
| 237 |
+
|
| 238 |
+
return {
|
| 239 |
+
"regime": REGIME_NAMES[regime_id],
|
| 240 |
+
"regime_id": regime_id,
|
| 241 |
+
"confidence": round(confidence, 4),
|
| 242 |
+
"probabilities": {
|
| 243 |
+
REGIME_NAMES[i]: round(float(p), 4) for i, p in enumerate(proba)
|
| 244 |
+
},
|
| 245 |
+
"model_accuracy": metadata.get("accuracy", 0),
|
| 246 |
+
"model_cv_accuracy": metadata.get("cv_accuracy", 0),
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# ─────────────────────────────────────────────
|
| 251 |
+
# MAIN
|
| 252 |
+
# ─────────────────────────────────────────────
|
| 253 |
+
def main():
|
| 254 |
+
parser = argparse.ArgumentParser(description="Aurora Brain — Regime Detector")
|
| 255 |
+
parser.add_argument("--symbol", default="BTCUSDT")
|
| 256 |
+
parser.add_argument("--timeframe", default="4h")
|
| 257 |
+
parser.add_argument("--test-pct", type=float, default=20.0)
|
| 258 |
+
parser.add_argument("--predict-only", action="store_true",
|
| 259 |
+
help="Solo predecir régimen actual (no entrenar)")
|
| 260 |
+
args = parser.parse_args()
|
| 261 |
+
|
| 262 |
+
labeled_path = os.path.join(DATA_DIR, f"labeled_{args.symbol}_{args.timeframe}.parquet")
|
| 263 |
+
if not os.path.exists(labeled_path):
|
| 264 |
+
logger.error("❌ No encontrado: %s — ejecutá regime_labeler.py primero", labeled_path)
|
| 265 |
+
return
|
| 266 |
+
|
| 267 |
+
df = pd.read_parquet(labeled_path)
|
| 268 |
+
|
| 269 |
+
if args.predict_only:
|
| 270 |
+
result = predict_regime(df)
|
| 271 |
+
logger.info("\n🔮 RÉGIMEN ACTUAL: %s (confianza: %.1f%%)",
|
| 272 |
+
result["regime"], result["confidence"] * 100)
|
| 273 |
+
logger.info(" Probabilidades: %s", result["probabilities"])
|
| 274 |
+
else:
|
| 275 |
+
feature_cols = select_features(df)
|
| 276 |
+
result = train_regime_model(df, feature_cols, test_pct=args.test_pct)
|
| 277 |
+
|
| 278 |
+
if result:
|
| 279 |
+
# Predecir régimen actual
|
| 280 |
+
pred = predict_regime(df)
|
| 281 |
+
logger.info("\n🔮 RÉGIMEN ACTUAL: %s (confianza: %.1f%%)",
|
| 282 |
+
pred["regime"], pred["confidence"] * 100)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
if __name__ == "__main__":
|
| 286 |
+
main()
|
regime_labeler.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 3 |
+
║ AURORA BRAIN — Regime Labeler ║
|
| 4 |
+
║ ║
|
| 5 |
+
║ Etiqueta cada vela con el régimen de mercado que ocurrió ║
|
| 6 |
+
║ DESPUÉS (post-hoc). Esto se usa como target para entrenar ║
|
| 7 |
+
║ el detector de régimen. ║
|
| 8 |
+
║ ║
|
| 9 |
+
║ Regímenes: ║
|
| 10 |
+
║ 0 = TRENDING — precio avanzó >5% en dirección del trend ║
|
| 11 |
+
║ 1 = RANGING — precio se mantuvo en rango ±3% ║
|
| 12 |
+
║ 2 = VOLATILE — drawdown >5% en 24h ║
|
| 13 |
+
║ 3 = BREAKOUT — movimiento >8% en 48h + volumen >3x ║
|
| 14 |
+
║ ║
|
| 15 |
+
║ Uso: ║
|
| 16 |
+
║ python regime_labeler.py ║
|
| 17 |
+
║ python regime_labeler.py --symbol BTCUSDT --horizon 12 ║
|
| 18 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 19 |
+
"""
|
| 20 |
+
import os
|
| 21 |
+
import argparse
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
import numpy as np
|
| 25 |
+
import pandas as pd
|
| 26 |
+
|
| 27 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 28 |
+
logger = logging.getLogger("AuroraBrain.Labeler")
|
| 29 |
+
|
| 30 |
+
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 31 |
+
|
| 32 |
+
# Régimen IDs
|
| 33 |
+
REGIME_TRENDING = 0
|
| 34 |
+
REGIME_RANGING = 1
|
| 35 |
+
REGIME_VOLATILE = 2
|
| 36 |
+
REGIME_BREAKOUT = 3
|
| 37 |
+
|
| 38 |
+
REGIME_NAMES = {0: "TRENDING", 1: "RANGING", 2: "VOLATILE", 3: "BREAKOUT"}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def label_regimes(df: pd.DataFrame, horizon: int = 12,
|
| 42 |
+
trend_threshold: float = 5.0,
|
| 43 |
+
range_threshold: float = 3.0,
|
| 44 |
+
volatile_dd_threshold: float = 5.0,
|
| 45 |
+
breakout_threshold: float = 8.0,
|
| 46 |
+
breakout_vol_mult: float = 3.0) -> pd.DataFrame:
|
| 47 |
+
"""
|
| 48 |
+
Etiqueta cada vela con el régimen que ocurrió en las siguientes `horizon` velas.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
df: DataFrame con features ya calculadas (debe tener close, volume, f_vol_sma_20)
|
| 52 |
+
horizon: Número de velas a mirar hacia adelante (default 12 = 48h en 4H)
|
| 53 |
+
trend_threshold: % mínimo de avance para considerar TRENDING
|
| 54 |
+
range_threshold: % máximo de rango para considerar RANGING
|
| 55 |
+
volatile_dd_threshold: % de drawdown para considerar VOLATILE
|
| 56 |
+
breakout_threshold: % de movimiento para considerar BREAKOUT
|
| 57 |
+
breakout_vol_mult: Multiplicador de volumen para BREAKOUT
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
DataFrame con columna 'regime' y 'regime_name' agregadas
|
| 61 |
+
"""
|
| 62 |
+
c = df["close"]
|
| 63 |
+
v = df["volume"]
|
| 64 |
+
|
| 65 |
+
n = len(df)
|
| 66 |
+
regimes = np.full(n, REGIME_RANGING) # Default: RANGING
|
| 67 |
+
|
| 68 |
+
logger.info("🏷️ Etiquetando regímenes (horizon=%d velas)...", horizon)
|
| 69 |
+
|
| 70 |
+
for i in range(n - horizon):
|
| 71 |
+
# Ventana de futuro
|
| 72 |
+
future_prices = c.iloc[i + 1: i + 1 + horizon]
|
| 73 |
+
future_volumes = v.iloc[i + 1: i + 1 + horizon]
|
| 74 |
+
|
| 75 |
+
if len(future_prices) < horizon:
|
| 76 |
+
continue
|
| 77 |
+
|
| 78 |
+
current_price = c.iloc[i]
|
| 79 |
+
if current_price <= 0:
|
| 80 |
+
continue
|
| 81 |
+
|
| 82 |
+
# Métricas de la ventana futura
|
| 83 |
+
max_price = future_prices.max()
|
| 84 |
+
min_price = future_prices.min()
|
| 85 |
+
end_price = future_prices.iloc[-1]
|
| 86 |
+
|
| 87 |
+
# Retorno neto
|
| 88 |
+
return_pct = (end_price - current_price) / current_price * 100
|
| 89 |
+
|
| 90 |
+
# Máximo drawdown en la ventana
|
| 91 |
+
running_max = future_prices.cummax()
|
| 92 |
+
drawdowns = (running_max - future_prices) / running_max * 100
|
| 93 |
+
max_drawdown = drawdowns.max()
|
| 94 |
+
|
| 95 |
+
# Máximo rally en la ventana
|
| 96 |
+
running_min = future_prices.cummin()
|
| 97 |
+
rallies = (future_prices - running_min) / running_min * 100
|
| 98 |
+
max_rally = rallies.max()
|
| 99 |
+
|
| 100 |
+
# Rango total
|
| 101 |
+
price_range = (max_price - min_price) / current_price * 100
|
| 102 |
+
|
| 103 |
+
# Volumen promedio futuro vs histórico
|
| 104 |
+
vol_sma = df["f_vol_sma_20"].iloc[i] if "f_vol_sma_20" in df.columns else v.iloc[max(0, i-20):i].mean()
|
| 105 |
+
avg_future_vol = future_volumes.mean()
|
| 106 |
+
vol_ratio = avg_future_vol / vol_sma if vol_sma > 0 else 1.0
|
| 107 |
+
|
| 108 |
+
# ── Clasificación (prioridad: VOLATILE > BREAKOUT > TRENDING > RANGING) ──
|
| 109 |
+
|
| 110 |
+
# VOLATILE: drawdown fuerte
|
| 111 |
+
if max_drawdown >= volatile_dd_threshold:
|
| 112 |
+
regimes[i] = REGIME_VOLATILE
|
| 113 |
+
continue
|
| 114 |
+
|
| 115 |
+
# BREAKOUT: movimiento grande + volumen alto
|
| 116 |
+
if price_range >= breakout_threshold and vol_ratio >= breakout_vol_mult:
|
| 117 |
+
regimes[i] = REGIME_BREAKOUT
|
| 118 |
+
continue
|
| 119 |
+
|
| 120 |
+
# TRENDING: avance sostenido en una dirección
|
| 121 |
+
if abs(return_pct) >= trend_threshold:
|
| 122 |
+
regimes[i] = REGIME_TRENDING
|
| 123 |
+
continue
|
| 124 |
+
|
| 125 |
+
# RANGING: precio se quedó en un rango estrecho
|
| 126 |
+
if price_range <= range_threshold:
|
| 127 |
+
regimes[i] = REGIME_RANGING
|
| 128 |
+
continue
|
| 129 |
+
|
| 130 |
+
# Default para movimientos moderados
|
| 131 |
+
if abs(return_pct) >= trend_threshold * 0.6:
|
| 132 |
+
regimes[i] = REGIME_TRENDING
|
| 133 |
+
else:
|
| 134 |
+
regimes[i] = REGIME_RANGING
|
| 135 |
+
|
| 136 |
+
# Últimas `horizon` velas no tienen etiqueta confiable
|
| 137 |
+
regimes[-horizon:] = -1 # Marcar como desconocido
|
| 138 |
+
|
| 139 |
+
df["regime"] = regimes
|
| 140 |
+
df["regime_name"] = df["regime"].map(REGIME_NAMES).fillna("UNKNOWN")
|
| 141 |
+
|
| 142 |
+
# Estadísticas
|
| 143 |
+
valid = df[df["regime"] >= 0]
|
| 144 |
+
counts = valid["regime_name"].value_counts()
|
| 145 |
+
total = len(valid)
|
| 146 |
+
|
| 147 |
+
logger.info("\n" + "=" * 50)
|
| 148 |
+
logger.info("🏷️ REGÍMENES ETIQUETADOS")
|
| 149 |
+
logger.info(" Total velas etiquetadas: %d", total)
|
| 150 |
+
for name, count in counts.items():
|
| 151 |
+
pct = count / total * 100
|
| 152 |
+
logger.info(" %-12s: %5d (%5.1f%%)", name, count, pct)
|
| 153 |
+
logger.info("=" * 50)
|
| 154 |
+
|
| 155 |
+
return df
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def label_targets(df: pd.DataFrame, horizons: list[int] = None) -> pd.DataFrame:
|
| 159 |
+
"""
|
| 160 |
+
Agrega targets de retorno futuro para los modelos de predicción.
|
| 161 |
+
Estos NO son los regímenes — son los targets numéricos.
|
| 162 |
+
"""
|
| 163 |
+
if horizons is None:
|
| 164 |
+
horizons = [6, 12, 24] # 24h, 48h, 96h en TF 4H
|
| 165 |
+
|
| 166 |
+
c = df["close"]
|
| 167 |
+
|
| 168 |
+
for h in horizons:
|
| 169 |
+
# Retorno futuro
|
| 170 |
+
df[f"target_return_{h}"] = c.shift(-h).pct_change(h).shift(h) if False else \
|
| 171 |
+
(c.shift(-h) - c) / c * 100
|
| 172 |
+
|
| 173 |
+
# Dirección (1=sube, 0=baja)
|
| 174 |
+
df[f"target_dir_{h}"] = (df[f"target_return_{h}"] > 0).astype(int)
|
| 175 |
+
|
| 176 |
+
# Max drawdown futuro
|
| 177 |
+
future_max = c.iloc[::-1].rolling(h, min_periods=1).max().iloc[::-1].shift(-1)
|
| 178 |
+
future_min = c.iloc[::-1].rolling(h, min_periods=1).min().iloc[::-1].shift(-1)
|
| 179 |
+
df[f"target_max_dd_{h}"] = (c - future_min) / c * 100
|
| 180 |
+
df[f"target_max_rally_{h}"] = (future_max - c) / c * 100
|
| 181 |
+
|
| 182 |
+
logger.info(" ✅ Targets generados para horizontes: %s", horizons)
|
| 183 |
+
return df
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ─────────────────────────────────────────────
|
| 187 |
+
# MAIN
|
| 188 |
+
# ─────────────────────────────────────────────
|
| 189 |
+
def main():
|
| 190 |
+
parser = argparse.ArgumentParser(description="Aurora Brain — Regime Labeler")
|
| 191 |
+
parser.add_argument("--symbol", default="BTCUSDT")
|
| 192 |
+
parser.add_argument("--timeframe", default="4h")
|
| 193 |
+
parser.add_argument("--horizon", type=int, default=12, help="Velas a futuro (default 12 = 48h en 4H)")
|
| 194 |
+
parser.add_argument("--all", action="store_true")
|
| 195 |
+
args = parser.parse_args()
|
| 196 |
+
|
| 197 |
+
symbols = ["BTCUSDT", "ETHUSDT", "SOLUSDT"] if args.all else [args.symbol]
|
| 198 |
+
|
| 199 |
+
for symbol in symbols:
|
| 200 |
+
features_path = os.path.join(DATA_DIR, f"features_{symbol}_{args.timeframe}.parquet")
|
| 201 |
+
if not os.path.exists(features_path):
|
| 202 |
+
logger.error("❌ No encontrado: %s — ejecutá feature_engine.py primero", features_path)
|
| 203 |
+
continue
|
| 204 |
+
|
| 205 |
+
df = pd.read_parquet(features_path)
|
| 206 |
+
logger.info("📊 Cargado %s: %d velas × %d columnas", symbol, len(df), len(df.columns))
|
| 207 |
+
|
| 208 |
+
df = label_regimes(df, horizon=args.horizon)
|
| 209 |
+
df = label_targets(df, horizons=[6, 12, 24])
|
| 210 |
+
|
| 211 |
+
out_path = os.path.join(DATA_DIR, f"labeled_{symbol}_{args.timeframe}.parquet")
|
| 212 |
+
df.to_parquet(out_path)
|
| 213 |
+
logger.info("💾 Guardado: %s", out_path)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
if __name__ == "__main__":
|
| 217 |
+
main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Aurora Brain — Dependencies
|
| 2 |
+
# Para HuggingFace Space (Python 3.10+)
|
| 3 |
+
|
| 4 |
+
# Core data
|
| 5 |
+
pandas>=2.0
|
| 6 |
+
numpy>=1.24
|
| 7 |
+
pyarrow>=12.0
|
| 8 |
+
|
| 9 |
+
# Technical analysis
|
| 10 |
+
pandas_ta>=0.3.14
|
| 11 |
+
|
| 12 |
+
# ML
|
| 13 |
+
scikit-learn>=1.3
|
| 14 |
+
xgboost>=2.0
|
| 15 |
+
|
| 16 |
+
# API
|
| 17 |
+
fastapi>=0.100
|
| 18 |
+
uvicorn>=0.22
|
| 19 |
+
pydantic>=2.0
|
| 20 |
+
|
| 21 |
+
# Data download
|
| 22 |
+
requests>=2.31
|
| 23 |
+
|
| 24 |
+
# Macro data
|
| 25 |
+
yfinance>=0.2.28
|
| 26 |
+
|
| 27 |
+
# Visualization (entrenamiento)
|
| 28 |
+
matplotlib>=3.7
|
| 29 |
+
|
| 30 |
+
# Future: TFT (Fase 3)
|
| 31 |
+
# darts>=0.27
|
| 32 |
+
# pytorch-lightning>=2.0
|
| 33 |
+
# torch>=2.0
|