kauabarros-24 commited on
Commit ·
57789e6
1
Parent(s): 40f7c0d
CHORE: Train model with random forest algorithm
Browse files- data/PETR4_SA.csv +0 -0
- data/PETRA_SA.csv +1 -0
- pyproject.toml +34 -7
- scripts/train_all.py +12 -0
- src/data_collection.py +1 -1
- src/features.py +3 -51
- src/features_enginnering.py +53 -0
- src/model_training +42 -0
data/PETR4_SA.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/PETRA_SA.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Date,Adj Close,Close,High,Low,Open,Volume
|
pyproject.toml
CHANGED
|
@@ -1,22 +1,49 @@
|
|
| 1 |
[project]
|
| 2 |
name = "AllStreet"
|
| 3 |
version = "0.1.0"
|
| 4 |
-
description = "
|
| 5 |
authors = [
|
| 6 |
-
{name = "kauabarros-24", email = "
|
| 7 |
]
|
| 8 |
-
dependencies = [
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
readme = "README.md"
|
| 11 |
license = {text = "MIT"}
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
[tool.pdm]
|
| 15 |
distribution = false
|
| 16 |
|
| 17 |
[dependency-groups]
|
| 18 |
dev = [
|
| 19 |
-
"black>=
|
| 20 |
-
"flake8>=
|
| 21 |
-
"pytest>=
|
| 22 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
[project]
|
| 2 |
name = "AllStreet"
|
| 3 |
version = "0.1.0"
|
| 4 |
+
description = "Modelo de ML para previsão de ações"
|
| 5 |
authors = [
|
| 6 |
+
{name = "kauabarros-24", email = "martinsbarroskaua@gmail.com"},
|
| 7 |
]
|
| 8 |
+
dependencies = [
|
| 9 |
+
"pandas>=2.0.0",
|
| 10 |
+
"numpy>=1.24.0",
|
| 11 |
+
"scipy>=1.10.0",
|
| 12 |
+
"statsmodels>=0.14.0",
|
| 13 |
+
"scikit-learn>=1.3.0",
|
| 14 |
+
"yfinance>=0.2.0",
|
| 15 |
+
"pandas-datareader>=0.10.0",
|
| 16 |
+
"matplotlib>=3.7.0",
|
| 17 |
+
"seaborn>=0.12.0",
|
| 18 |
+
"plotly>=5.14.0",
|
| 19 |
+
"jupyter>=1.0.0",
|
| 20 |
+
"jupyterlab>=4.0.0",
|
| 21 |
+
"ipykernel>=6.0.0"
|
| 22 |
+
]
|
| 23 |
+
requires-python = ">=3.12"
|
| 24 |
readme = "README.md"
|
| 25 |
license = {text = "MIT"}
|
| 26 |
|
| 27 |
+
[build-system]
|
| 28 |
+
requires = ["pdm-backend"]
|
| 29 |
+
build-backend = "pdm.backend"
|
| 30 |
|
| 31 |
[tool.pdm]
|
| 32 |
distribution = false
|
| 33 |
|
| 34 |
[dependency-groups]
|
| 35 |
dev = [
|
| 36 |
+
"black>=23.0.0",
|
| 37 |
+
"flake8>=6.0.0",
|
| 38 |
+
"pytest>=7.0.0"
|
| 39 |
]
|
| 40 |
+
|
| 41 |
+
[tool.pdm.scripts]
|
| 42 |
+
download-data = "python3 src/data_collection.py"
|
| 43 |
+
generate-features = "python3 src/features_enginnering.py"
|
| 44 |
+
features = "python3 src/features.py"
|
| 45 |
+
train-all = "python3 scripts/train_all.py"
|
| 46 |
+
train-rf = "python3 -c 'from src.model_training import train_model; train_model(10, \"random_forest\"); train_model(24, \"random_forest\"); train_model(60, \"random_forest\"); train_model(120, \"random_forest\")'"
|
| 47 |
+
train-logistic = "python3 -c 'from src.model_training import train_model; train_model(10, \"logistic\"); train_model(24, \"logistic\"); train_model(60, \"logistic\"); train_model(120, \"logistic\")'"
|
| 48 |
+
backtest = "python3 src/backtesting.py"
|
| 49 |
+
pipeline = "pdm run download-data && pdm run generate-features && pdm run train-all"
|
scripts/train_all.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Train one Random Forest classifier per prediction horizon."""
from src.model_training import train_model

# Horizons in MONTHS — these must match the features_{h}m.csv / target_{h}m.csv
# files generated by src/features_enginnering.py (default horizons 10, 24, 60,
# 120). BUG FIX: the previous string labels ('10m', '2y', '5y', '10y') pointed
# at files that are never produced (e.g. data/features_10mm.csv) and crashed
# train_model on load.
horizons = [10, 24, 60, 120]

print("=== Treinando com Random Forest ===")
for h in horizons:
    train_model(h, model_type='random_forest')

# NOTE(review): logistic-regression sweep intentionally left disabled,
# as in the original.
"""print("\n=== Treinando com Regressão Logística ===")
for h in horizons:
    train_model(h, model_type='logistic')
"""
|
src/data_collection.py
CHANGED
|
@@ -18,7 +18,7 @@ def save_data(df: pd.DataFrame, ticker: str, data_dir='data'):
|
|
| 18 |
return path
|
| 19 |
|
| 20 |
if __name__ == "__main__":
|
| 21 |
-
ticker = "
|
| 22 |
df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
|
| 23 |
print(df.head())
|
| 24 |
save_data(df, ticker)
|
|
|
|
| 18 |
return path
|
| 19 |
|
| 20 |
if __name__ == "__main__":
|
| 21 |
+
ticker = "PETR4.SA"
|
| 22 |
df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
|
| 23 |
print(df.head())
|
| 24 |
save_data(df, ticker)
|
src/features.py
CHANGED
|
@@ -1,54 +1,6 @@
|
|
| 1 |
-
|
| 2 |
-
import numpy as np
|
| 3 |
-
import os
|
| 4 |
-
|
| 5 |
-
def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
|
| 6 |
-
filename = f"{ticker.replace('.', '_')}.csv"
|
| 7 |
-
path = os.path.join(data_dir, filename)
|
| 8 |
-
df = pd.read_csv(path, index_col=0, parses_data=True)
|
| 9 |
-
|
| 10 |
-
return df
|
| 11 |
-
|
| 12 |
-
def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
|
| 13 |
-
df_daily.index = pd.to_datetime()
|
| 14 |
-
monthly = df_daily.resample('M').last()
|
| 15 |
-
return monthly
|
| 16 |
-
|
| 17 |
-
def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
|
| 18 |
-
df = df_monthly.copy()
|
| 19 |
-
close = df["Close"]
|
| 20 |
-
|
| 21 |
-
for lag in [1, 2, 3, 4 , 5]:
|
| 22 |
-
df[f"ret_{lag}m"] = close.pct_change(lag).shift(1)
|
| 23 |
-
|
| 24 |
-
df['ma6'] = close.rolling(6).mean()
|
| 25 |
-
df['ma12'] = close.rolling(12).mean()
|
| 26 |
-
df['close/ma6'] = close / df['ma6']
|
| 27 |
-
df['close/ma12'] = close / df['ma12']
|
| 28 |
-
|
| 29 |
-
future_close = close.shift(-horizon_months)
|
| 30 |
-
df['target'] = (future_close > close).astype(int)
|
| 31 |
-
|
| 32 |
-
df.dropna(inplace=True)
|
| 33 |
-
return df
|
| 34 |
-
|
| 35 |
-
def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
|
| 36 |
-
df_daily = load_data(ticker)
|
| 37 |
-
df_monthly = resample_monthly()
|
| 38 |
-
|
| 39 |
-
data_dict = {}
|
| 40 |
-
for h in horizons:
|
| 41 |
-
df_h = create_features_and_target(df_monthly, h)
|
| 42 |
-
feature_cols = [col for col in df_h.columns if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
|
| 43 |
-
X = df_h[feature_cols]
|
| 44 |
-
y = df_h['target']
|
| 45 |
-
data_dict[h] = (X, y)
|
| 46 |
-
print(f"Horizonte {h} meses: {X.shape[0]} amostras")
|
| 47 |
-
return data_dict
|
| 48 |
|
| 49 |
if __name__ == "__main__":
|
| 50 |
ticker = "PETR4.SA"
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
X.to_csv(f"data/features_{h}m.csv")
|
| 54 |
-
y.to_csv(f"data/target_{h}m.csv")
|
|
|
|
# Thin CLI wrapper around the feature-engineering pipeline.
# BUG FIX: the module added in this commit is named `features_enginnering`
# (sic — see src/features_enginnering.py); importing `feature_engineering`
# raised ModuleNotFoundError.
from src.features_enginnering import prepare_data_for_all_horizons

if __name__ == "__main__":
    ticker = "PETR4.SA"
    prepare_data_for_all_horizons(ticker)
    print("Arquivos gerados com sucesso na pasta data/")
|
|
|
|
|
|
src/features_enginnering.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
|
def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
    """Load the raw daily price CSV for *ticker* from *data_dir*.

    Looks for ``raw_<TICKER>.csv`` first, then plain ``<TICKER>.csv``
    (dots in the ticker are replaced by underscores). Raises
    FileNotFoundError when neither file exists.
    """
    base = ticker.replace('.', '_')
    candidates = [
        os.path.join(data_dir, f"raw_{base}.csv"),
        os.path.join(data_dir, f"{base}.csv"),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            # First column is the date index.
            return pd.read_csv(candidate, index_col=0, parse_dates=True)
    raise FileNotFoundError(
        f"Arquivo não encontrado: {candidates[0]} ou {candidates[1]}"
    )
|
| 17 |
+
|
def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
    """Collapse daily rows to month-end bars (last observation per month)."""
    # Coerce the index to datetimes so .resample() works on CSV-loaded frames.
    df_daily.index = pd.to_datetime(df_daily.index)
    return df_daily.resample('M').last()
| 22 |
+
|
def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
    """Build lag/moving-average features and a binary up/down target.

    The target is 1 when the close ``horizon_months`` ahead is higher than
    the current close, else 0. Rows whose future close is unknown (the final
    ``horizon_months`` rows) are dropped rather than mislabeled.

    Returns the feature/target DataFrame with all NaN rows removed.
    """
    df = df_monthly.copy()
    close = df["Close"]

    # Lagged returns, shifted one extra month so features use only past data.
    for lag in [1, 2, 3, 4, 5]:
        df[f"ret_{lag}m"] = close.pct_change(lag).shift(1)

    # Trend features: moving averages and close/MA ratios.
    df['ma6'] = close.rolling(6).mean()
    df['ma12'] = close.rolling(12).mean()
    df['close/ma6'] = close / df['ma6']
    df['close/ma12'] = close / df['ma12']

    future_close = close.shift(-horizon_months)
    df['target'] = (future_close > close).astype(int)

    # BUG FIX: (NaN > close) evaluates False, so the last `horizon_months`
    # rows were silently labeled 0 ("down") even though their outcome is
    # unknown — drop them instead of training on fabricated labels.
    df = df[future_close.notna()]

    df = df.dropna()
    return df
|
| 36 |
+
|
def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
    """Build and save feature/target CSVs for each horizon (in months).

    Writes data/features_{h}m.csv and data/target_{h}m.csv for every h and
    prints how many samples were saved per horizon.
    """
    os.makedirs('data', exist_ok=True)
    monthly = resample_monthly(load_data(ticker))
    # Raw price columns and the label are excluded from the feature matrix.
    non_features = ['target', 'Close', 'Open', 'High', 'Low', 'Volume']
    for h in horizons:
        df_h = create_features_and_target(monthly, h)
        X = df_h[[col for col in df_h.columns if col not in non_features]]
        y = df_h['target']
        X.to_csv(f"data/features_{h}m.csv")
        y.to_csv(f"data/target_{h}m.csv", header=['target'])
        print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
|
| 50 |
+
|
if __name__ == "__main__":
    # Script entry point: generate feature/target CSVs for PETR4 (Petrobras)
    # using the default horizons (10, 24, 60, 120 months).
    ticker = "PETR4.SA"
    prepare_data_for_all_horizons(ticker)
|
src/model_training
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.linear_model import LogisticRegression
|
| 4 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 5 |
+
from sklearn.metrics import accuracy_score, classification_report
|
| 6 |
+
|
def load_features_target(horizon):
    """Read the saved feature matrix and target series for *horizon* months.

    Expects the CSVs produced by the feature-engineering step under data/.
    Returns (X, Y) where Y is squeezed to a Series.
    """
    features_path = f"data/features_{horizon}m.csv"
    target_path = f"data/target_{horizon}m.csv"
    X = pd.read_csv(features_path, index_col=0)
    Y = pd.read_csv(target_path, index_col=0).squeeze()
    return X, Y
|
| 12 |
+
|
def temporal_train_split(X, Y, test_size=0.2):
    """Chronological train/test split (no shuffling — avoids look-ahead bias).

    The first (1 - test_size) fraction of rows becomes the train set and the
    remaining tail the test set. Returns (X_train, X_test, Y_train, Y_test).
    """
    cutoff = int(len(X) * (1 - test_size))
    return X.iloc[:cutoff], X.iloc[cutoff:], Y.iloc[:cutoff], Y.iloc[cutoff:]
|
| 20 |
+
|
def train_model(horizon, model_type="random_forest"):
    """Train a classifier for one prediction horizon and report test metrics.

    Parameters
    ----------
    horizon : int
        Horizon in months; selects data/features_{horizon}m.csv and
        data/target_{horizon}m.csv.
    model_type : str
        'random_forest' (default) or 'logistic'.

    Returns
    -------
    tuple
        (fitted model, test accuracy, classification report string).
    """
    X, Y = load_features_target(horizon)

    X_train, X_test, Y_train, Y_test = temporal_train_split(X, Y, test_size=0.2)

    # BUG FIX: model_type used to be ignored — a RandomForest was trained even
    # when 'logistic' was requested (the train-logistic script in
    # pyproject.toml was silently broken). Dispatch on the requested type; the
    # old try/except around the constructor was dead code and is removed.
    if model_type == "random_forest":
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    elif model_type == "logistic":
        model = LogisticRegression(max_iter=1000)
    else:
        raise ValueError(f"There's a error in training model: unknown model_type {model_type!r}")

    model.fit(X_train, Y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    # NOTE(review): assumes both classes appear in the test split; with a
    # single class present, classification_report raises — confirm on real data.
    report = classification_report(Y_test, y_pred, target_names=['Queda', 'Sobe'])

    print(f"\n--- Horizonte: {horizon} ---")
    print(f"Acurácia no teste: {accuracy:.4f}")
    print("Relatório de classificação:")
    print(report)

    return model, accuracy, report
|