kauabarros-24 committed on
Commit
57789e6
·
1 Parent(s): 40f7c0d

CHORE: Train model with random forest algorithm

Browse files
data/PETR4_SA.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/PETRA_SA.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ Date,Adj Close,Close,High,Low,Open,Volume
pyproject.toml CHANGED
@@ -1,22 +1,49 @@
1
  [project]
2
  name = "AllStreet"
3
  version = "0.1.0"
4
- description = "Default template for PDM package"
5
  authors = [
6
- {name = "kauabarros-24", email = "user.mail:martinsbarroskaua@gmail.com"},
7
  ]
8
- dependencies = ["fastapi>=0.135.1", "pandas>=3.0.1", "numpy>=2.4.2", "scipy>=1.17.1", "statsmodels>=0.14.6", "scikit-learn>=1.8.0", "yfinance>=1.2.0", "pandas-datareader>=0.10.0", "alpha-vantage>=3.0.0", "investpy>=1.0.8", "backtrader>=1.9.78.123", "vectorbt>=0.28.2", "influxdb-client>=1.50.0", "jupyter>=1.1.1", "jupyterlab>=4.5.5", "ipykernel>=7.2.0"]
9
- requires-python = "==3.12.*"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  readme = "README.md"
11
  license = {text = "MIT"}
12
 
 
 
 
13
 
14
  [tool.pdm]
15
  distribution = false
16
 
17
  [dependency-groups]
18
  dev = [
19
- "black>=26.3.0",
20
- "flake8>=7.3.0",
21
- "pytest>=9.0.2",
22
  ]
 
 
 
 
 
 
 
 
 
 
 
1
[project]
name = "AllStreet"
version = "0.1.0"
description = "Modelo de ML para previsão de ações"
authors = [
    {name = "kauabarros-24", email = "martinsbarroskaua@gmail.com"},
]
dependencies = [
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "scipy>=1.10.0",
    "statsmodels>=0.14.0",
    "scikit-learn>=1.3.0",
    "yfinance>=0.2.0",
    "pandas-datareader>=0.10.0",
    "matplotlib>=3.7.0",
    "seaborn>=0.12.0",
    "plotly>=5.14.0",
    "jupyter>=1.0.0",
    "jupyterlab>=4.0.0",
    "ipykernel>=6.0.0",
]
requires-python = ">=3.12"
readme = "README.md"
license = {text = "MIT"}

[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[tool.pdm]
distribution = false

[dependency-groups]
dev = [
    "black>=23.0.0",
    "flake8>=6.0.0",
    "pytest>=7.0.0",
]

[tool.pdm.scripts]
download-data = "python3 src/data_collection.py"
generate-features = "python3 src/features_enginnering.py"
features = "python3 src/features.py"
train-all = "python3 scripts/train_all.py"
train-rf = "python3 -c 'from src.model_training import train_model; train_model(10, \"random_forest\"); train_model(24, \"random_forest\"); train_model(60, \"random_forest\"); train_model(120, \"random_forest\")'"
# Fixed: the two scripts below used bare `python`, inconsistent with every
# other entry; on systems where `python` is Python 2 or a different venv
# they would run the wrong interpreter.
train-logistic = "python3 -c 'from src.model_training import train_model; train_model(10, \"logistic\"); train_model(24, \"logistic\"); train_model(60, \"logistic\"); train_model(120, \"logistic\")'"
backtest = "python3 src/backtesting.py"
pipeline = "pdm run download-data && pdm run generate-features && pdm run train-all"
scripts/train_all.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from src.model_training import train_model

# Prediction horizons in MONTHS. These must match the integer horizons the
# feature-engineering step uses to name its output files
# (data/features_{h}m.csv / data/target_{h}m.csv) and the horizons the
# pyproject `train-rf` script passes to train_model. The previous string
# values ('10m', '2y', ...) produced non-existent file names like
# "features_10mm.csv" and made every run fail with FileNotFoundError.
horizons = [10, 24, 60, 120]

print("=== Treinando com Random Forest ===")
for h in horizons:
    train_model(h, model_type='random_forest')
src/data_collection.py CHANGED
@@ -18,7 +18,7 @@ def save_data(df: pd.DataFrame, ticker: str, data_dir='data'):
18
  return path
19
 
20
  if __name__ == "__main__":
21
- ticker = "PETRA.SA"
22
  df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
23
  print(df.head())
24
  save_data(df, ticker)
 
18
  return path
19
 
20
if __name__ == "__main__":
    # Fetch the full PETR4 daily history and persist it under data/.
    symbol = "PETR4.SA"
    history = download_stock_data(symbol, start="2010-01-01", end="2025-01-01")
    print(history.head())
    save_data(history, symbol)
src/features.py CHANGED
@@ -1,54 +1,6 @@
1
- import pandas as pd
2
- import numpy as np
3
- import os
4
-
5
- def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
6
- filename = f"{ticker.replace('.', '_')}.csv"
7
- path = os.path.join(data_dir, filename)
8
- df = pd.read_csv(path, index_col=0, parses_data=True)
9
-
10
- return df
11
-
12
- def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
13
- df_daily.index = pd.to_datetime()
14
- monthly = df_daily.resample('M').last()
15
- return monthly
16
-
17
- def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
18
- df = df_monthly.copy()
19
- close = df["Close"]
20
-
21
- for lag in [1, 2, 3, 4 , 5]:
22
- df[f"ret_{lag}m"] = close.pct_change(lag).shift(1)
23
-
24
- df['ma6'] = close.rolling(6).mean()
25
- df['ma12'] = close.rolling(12).mean()
26
- df['close/ma6'] = close / df['ma6']
27
- df['close/ma12'] = close / df['ma12']
28
-
29
- future_close = close.shift(-horizon_months)
30
- df['target'] = (future_close > close).astype(int)
31
-
32
- df.dropna(inplace=True)
33
- return df
34
-
35
- def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
36
- df_daily = load_data(ticker)
37
- df_monthly = resample_monthly()
38
-
39
- data_dict = {}
40
- for h in horizons:
41
- df_h = create_features_and_target(df_monthly, h)
42
- feature_cols = [col for col in df_h.columns if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
43
- X = df_h[feature_cols]
44
- y = df_h['target']
45
- data_dict[h] = (X, y)
46
- print(f"Horizonte {h} meses: {X.shape[0]} amostras")
47
- return data_dict
48
 
49
  if __name__ == "__main__":
50
  ticker = "PETR4.SA"
51
- data_dict = prepare_data_for_all_horizons(ticker)
52
- for h, (X, y) in data_dict.items():
53
- X.to_csv(f"data/features_{h}m.csv")
54
- y.to_csv(f"data/target_{h}m.csv")
 
1
# Fixed import: the module added by this commit is src/features_enginnering.py,
# but this file imported `src.feature_engineering`, which does not exist and
# raised ModuleNotFoundError on every run (including the pdm `features` script).
from src.features_enginnering import prepare_data_for_all_horizons

if __name__ == "__main__":
    ticker = "PETR4.SA"
    prepare_data_for_all_horizons(ticker)
    print("Arquivos gerados com sucesso na pasta data/")
 
 
src/features_enginnering.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+
5
def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
    """Load the raw price CSV for *ticker*.

    Tries ``raw_<TICKER>.csv`` first, then falls back to ``<TICKER>.csv``
    (dots in the ticker are replaced by underscores). Raises
    FileNotFoundError when neither file exists.
    """
    base = ticker.replace('.', '_')
    candidates = [
        os.path.join(data_dir, f"raw_{base}.csv"),
        os.path.join(data_dir, f"{base}.csv"),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return pd.read_csv(candidate, index_col=0, parse_dates=True)
    raise FileNotFoundError(
        f"Arquivo não encontrado: {candidates[0]} ou {candidates[1]}"
    )
17
+
18
def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
    """Collapse daily rows to one row per calendar month (last observation).

    NOTE: mutates ``df_daily.index`` in place (coerced to DatetimeIndex),
    matching the original behavior. The 'M' alias is deprecated in newer
    pandas in favor of 'ME' -- TODO confirm target pandas version before
    switching.
    """
    df_daily.index = pd.to_datetime(df_daily.index)
    return df_daily.resample('M').last()
22
+
23
def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
    """Build lagged-return / moving-average features plus a binary target.

    target = 1 when the close ``horizon_months`` rows ahead is higher than
    the current close, 0 otherwise. Warm-up rows (NaN lags / moving
    averages) AND the trailing rows whose future close is unknown are
    dropped.

    Returns a new DataFrame; ``df_monthly`` is not modified.
    """
    df = df_monthly.copy()
    close = df["Close"]

    # Lagged returns, shifted one extra step so each feature only uses
    # information available strictly before the current row.
    for lag in [1, 2, 3, 4, 5]:
        df[f"ret_{lag}m"] = close.pct_change(lag).shift(1)

    df['ma6'] = close.rolling(6).mean()
    df['ma12'] = close.rolling(12).mean()
    df['close/ma6'] = close / df['ma6']
    df['close/ma12'] = close / df['ma12']

    future_close = close.shift(-horizon_months)
    # Bug fix: a NaN future close compares False, so the last
    # `horizon_months` rows were silently labeled 0 and survived dropna(),
    # leaking unlabeled look-ahead rows into training. Mask them as NaN so
    # dropna() removes them.
    df['target'] = (future_close > close).astype(int).where(future_close.notna())

    df.dropna(inplace=True)
    df['target'] = df['target'].astype(int)  # restore int dtype after masking
    return df
36
+
37
def prepare_data_for_all_horizons(ticker: str, horizons=(10, 24, 60, 120)):
    """Generate and save feature/target CSVs for each prediction horizon.

    Parameters
    ----------
    ticker : str
        Ticker symbol; resolves the raw CSV via ``load_data``.
    horizons : iterable of int
        Prediction horizons in months. Default changed from a mutable list
        to a tuple (mutable default arguments are shared across calls);
        callers passing lists are unaffected.

    Writes ``data/features_{h}m.csv`` and ``data/target_{h}m.csv`` for every
    h and prints the sample count per horizon.
    """
    os.makedirs('data', exist_ok=True)
    df_daily = load_data(ticker)
    df_monthly = resample_monthly(df_daily)
    for h in horizons:
        df_h = create_features_and_target(df_monthly, h)
        # Everything except the raw OHLCV columns and the label is a feature.
        feature_cols = [col for col in df_h.columns
                        if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
        X = df_h[feature_cols]
        y = df_h['target']
        X.to_csv(f"data/features_{h}m.csv")
        y.to_csv(f"data/target_{h}m.csv", header=['target'])
        print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
50
+
51
if __name__ == "__main__":
    # Regenerate every feature/target file for the default ticker.
    prepare_data_for_all_horizons("PETR4.SA")
src/model_training ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.ensemble import RandomForestClassifier
5
+ from sklearn.metrics import accuracy_score, classification_report
6
+
7
def load_features_target(horizon):
    """Read the saved feature matrix and target vector for `horizon` months.

    Returns (X, Y) where X is a DataFrame and Y a Series (the single-column
    target CSV is squeezed).
    """
    features = pd.read_csv(f"data/features_{horizon}m.csv", index_col=0)
    target = pd.read_csv(f"data/target_{horizon}m.csv", index_col=0).squeeze()
    return features, target
12
+
13
def temporal_train_split(X, Y, test_size=0.2):
    """Chronological split: the first (1 - test_size) rows train, rest test.

    No shuffling -- order is preserved so the test set is strictly later
    than the training set.
    """
    cut = int(len(X) * (1 - test_size))
    return X.iloc[:cut], X.iloc[cut:], Y.iloc[:cut], Y.iloc[cut:]
20
+
21
def train_model(horizon, model_type="random_forest"):
    """Train a classifier for the given horizon and report test metrics.

    Parameters
    ----------
    horizon : int
        Prediction horizon in months; selects data/features_{horizon}m.csv
        and data/target_{horizon}m.csv.
    model_type : str
        "random_forest" (default) or "logistic".

    Returns
    -------
    (model, accuracy, report)

    Raises
    ------
    ValueError
        If ``model_type`` is not a recognized model name.
    """
    X, Y = load_features_target(horizon)
    X_train, X_test, Y_train, Y_test = temporal_train_split(X, Y, test_size=0.2)

    # Bug fix: model_type was previously ignored (a random forest was always
    # built inside a pointless try/except), so the pdm `train-logistic`
    # script silently trained random forests.
    if model_type == "random_forest":
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    elif model_type == "logistic":
        model = LogisticRegression(max_iter=1000)
    else:
        raise ValueError(f"Unknown model_type: {model_type!r}")

    model.fit(X_train, Y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    # target_names assumes both classes appear in the test slice.
    report = classification_report(Y_test, y_pred, target_names=['Queda', 'Sobe'])

    print(f"\n--- Horizonte: {horizon} ---")
    print(f"Acurácia no teste: {accuracy:.4f}")
    print("Relatório de classificação:")
    print(report)

    return model, accuracy, report