kauabarros-24 commited on
Commit ·
57789e6
1
Parent(s): 40f7c0d
CHORE: Train model with random forest algorithm
Browse files- data/PETR4_SA.csv +0 -0
- data/PETRA_SA.csv +1 -0
- pyproject.toml +34 -7
- scripts/train_all.py +12 -0
- src/data_collection.py +1 -1
- src/features.py +3 -51
- src/features_enginnering.py +53 -0
- src/model_training +42 -0
data/PETR4_SA.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/PETRA_SA.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Date,Adj Close,Close,High,Low,Open,Volume
|
pyproject.toml
CHANGED
|
@@ -1,22 +1,49 @@
|
|
| 1 |
[project]
|
| 2 |
name = "AllStreet"
|
| 3 |
version = "0.1.0"
|
| 4 |
-
description = "
|
| 5 |
authors = [
|
| 6 |
-
{name = "kauabarros-24", email = "
|
| 7 |
]
|
| 8 |
-
dependencies = [
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
readme = "README.md"
|
| 11 |
license = {text = "MIT"}
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
[tool.pdm]
|
| 15 |
distribution = false
|
| 16 |
|
| 17 |
[dependency-groups]
|
| 18 |
dev = [
|
| 19 |
-
"black>=
|
| 20 |
-
"flake8>=
|
| 21 |
-
"pytest>=
|
| 22 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
[project]
|
| 2 |
name = "AllStreet"
|
| 3 |
version = "0.1.0"
|
| 4 |
+
description = "Modelo de ML para previsão de ações"
|
| 5 |
authors = [
|
| 6 |
+
{name = "kauabarros-24", email = "martinsbarroskaua@gmail.com"},
|
| 7 |
]
|
| 8 |
+
dependencies = [
|
| 9 |
+
"pandas>=2.0.0",
|
| 10 |
+
"numpy>=1.24.0",
|
| 11 |
+
"scipy>=1.10.0",
|
| 12 |
+
"statsmodels>=0.14.0",
|
| 13 |
+
"scikit-learn>=1.3.0",
|
| 14 |
+
"yfinance>=0.2.0",
|
| 15 |
+
"pandas-datareader>=0.10.0",
|
| 16 |
+
"matplotlib>=3.7.0",
|
| 17 |
+
"seaborn>=0.12.0",
|
| 18 |
+
"plotly>=5.14.0",
|
| 19 |
+
"jupyter>=1.0.0",
|
| 20 |
+
"jupyterlab>=4.0.0",
|
| 21 |
+
"ipykernel>=6.0.0"
|
| 22 |
+
]
|
| 23 |
+
requires-python = ">=3.12"
|
| 24 |
readme = "README.md"
|
| 25 |
license = {text = "MIT"}
|
| 26 |
|
| 27 |
+
[build-system]
|
| 28 |
+
requires = ["pdm-backend"]
|
| 29 |
+
build-backend = "pdm.backend"
|
| 30 |
|
| 31 |
[tool.pdm]
|
| 32 |
distribution = false
|
| 33 |
|
| 34 |
[dependency-groups]
|
| 35 |
dev = [
|
| 36 |
+
"black>=23.0.0",
|
| 37 |
+
"flake8>=6.0.0",
|
| 38 |
+
"pytest>=7.0.0"
|
| 39 |
]
|
| 40 |
+
|
| 41 |
+
[tool.pdm.scripts]
|
| 42 |
+
download-data = "python3 src/data_collection.py"
|
| 43 |
+
generate-features = "python3 src/features_enginnering.py"
|
| 44 |
+
features = "python3 src/features.py"
|
| 45 |
+
train-all = "python3 scripts/train_all.py"
|
| 46 |
+
train-rf = "python3 -c 'from src.model_training import train_model; train_model(10, \"random_forest\"); train_model(24, \"random_forest\"); train_model(60, \"random_forest\"); train_model(120, \"random_forest\")'"
|
| 47 |
+
train-logistic = "python3 -c 'from src.model_training import train_model; train_model(10, \"logistic\"); train_model(24, \"logistic\"); train_model(60, \"logistic\"); train_model(120, \"logistic\")'"
|
| 48 |
+
backtest = "python3 src/backtesting.py"
|
| 49 |
+
pipeline = "pdm run download-data && pdm run generate-features && pdm run train-all"
|
scripts/train_all.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Train one Random Forest classifier per prediction horizon."""
from src.model_training import train_model

# Horizons in MONTHS — these must match the features_{h}m.csv / target_{h}m.csv
# files generated by src/features_enginnering.py (default horizons 10, 24, 60,
# 120). BUG FIX: the previous string labels ('10m', '2y', '5y', '10y') pointed
# at files that are never produced (e.g. data/features_10mm.csv) and crashed
# train_model on load.
horizons = [10, 24, 60, 120]

print("=== Treinando com Random Forest ===")
for h in horizons:
    train_model(h, model_type='random_forest')

# NOTE(review): logistic-regression sweep intentionally left disabled,
# as in the original.
"""print("\n=== Treinando com Regressão Logística ===")
for h in horizons:
    train_model(h, model_type='logistic')
"""
|
src/data_collection.py
CHANGED
|
@@ -18,7 +18,7 @@ def save_data(df: pd.DataFrame, ticker: str, data_dir='data'):
|
|
| 18 |
return path
|
| 19 |
|
| 20 |
if __name__ == "__main__":
|
| 21 |
-
ticker = "
|
| 22 |
df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
|
| 23 |
print(df.head())
|
| 24 |
save_data(df, ticker)
|
|
|
|
| 18 |
return path
|
| 19 |
|
| 20 |
if __name__ == "__main__":
|
| 21 |
+
ticker = "PETR4.SA"
|
| 22 |
df = download_stock_data(ticker, start="2010-01-01", end="2025-01-01")
|
| 23 |
print(df.head())
|
| 24 |
save_data(df, ticker)
|
src/features.py
CHANGED
|
@@ -1,54 +1,6 @@
|
|
| 1 |
-
|
| 2 |
-
import numpy as np
|
| 3 |
-
import os
|
| 4 |
-
|
| 5 |
-
def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
|
| 6 |
-
filename = f"{ticker.replace('.', '_')}.csv"
|
| 7 |
-
path = os.path.join(data_dir, filename)
|
| 8 |
-
df = pd.read_csv(path, index_col=0, parses_data=True)
|
| 9 |
-
|
| 10 |
-
return df
|
| 11 |
-
|
| 12 |
-
def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
|
| 13 |
-
df_daily.index = pd.to_datetime()
|
| 14 |
-
monthly = df_daily.resample('M').last()
|
| 15 |
-
return monthly
|
| 16 |
-
|
| 17 |
-
def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
|
| 18 |
-
df = df_monthly.copy()
|
| 19 |
-
close = df["Close"]
|
| 20 |
-
|
| 21 |
-
for lag in [1, 2, 3, 4 , 5]:
|
| 22 |
-
df[f"ret_{lag}m"] = close.pct_change(lag).shift(1)
|
| 23 |
-
|
| 24 |
-
df['ma6'] = close.rolling(6).mean()
|
| 25 |
-
df['ma12'] = close.rolling(12).mean()
|
| 26 |
-
df['close/ma6'] = close / df['ma6']
|
| 27 |
-
df['close/ma12'] = close / df['ma12']
|
| 28 |
-
|
| 29 |
-
future_close = close.shift(-horizon_months)
|
| 30 |
-
df['target'] = (future_close > close).astype(int)
|
| 31 |
-
|
| 32 |
-
df.dropna(inplace=True)
|
| 33 |
-
return df
|
| 34 |
-
|
| 35 |
-
def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
|
| 36 |
-
df_daily = load_data(ticker)
|
| 37 |
-
df_monthly = resample_monthly()
|
| 38 |
-
|
| 39 |
-
data_dict = {}
|
| 40 |
-
for h in horizons:
|
| 41 |
-
df_h = create_features_and_target(df_monthly, h)
|
| 42 |
-
feature_cols = [col for col in df_h.columns if col not in ['target', 'Close', 'Open', 'High', 'Low', 'Volume']]
|
| 43 |
-
X = df_h[feature_cols]
|
| 44 |
-
y = df_h['target']
|
| 45 |
-
data_dict[h] = (X, y)
|
| 46 |
-
print(f"Horizonte {h} meses: {X.shape[0]} amostras")
|
| 47 |
-
return data_dict
|
| 48 |
|
| 49 |
if __name__ == "__main__":
|
| 50 |
ticker = "PETR4.SA"
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
X.to_csv(f"data/features_{h}m.csv")
|
| 54 |
-
y.to_csv(f"data/target_{h}m.csv")
|
|
|
|
# Thin CLI wrapper around the feature-engineering pipeline.
# BUG FIX: the module added in this commit is named `features_enginnering`
# (sic — see src/features_enginnering.py); importing `feature_engineering`
# raised ModuleNotFoundError.
from src.features_enginnering import prepare_data_for_all_horizons

if __name__ == "__main__":
    ticker = "PETR4.SA"
    prepare_data_for_all_horizons(ticker)
    print("Arquivos gerados com sucesso na pasta data/")
|
|
|
|
|
|
src/features_enginnering.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
|
def load_data(ticker: str, data_dir='data') -> pd.DataFrame:
    """Load the raw daily price CSV for *ticker* from *data_dir*.

    Looks for ``raw_<TICKER>.csv`` first, then plain ``<TICKER>.csv``
    (dots in the ticker are replaced by underscores). Raises
    FileNotFoundError when neither file exists.
    """
    base = ticker.replace('.', '_')
    candidates = [
        os.path.join(data_dir, f"raw_{base}.csv"),
        os.path.join(data_dir, f"{base}.csv"),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            # First column is the date index.
            return pd.read_csv(candidate, index_col=0, parse_dates=True)
    raise FileNotFoundError(
        f"Arquivo não encontrado: {candidates[0]} ou {candidates[1]}"
    )
|
| 17 |
+
|
def resample_monthly(df_daily: pd.DataFrame) -> pd.DataFrame:
    """Collapse daily rows to month-end bars (last observation per month)."""
    # Coerce the index to datetimes so .resample() works on CSV-loaded frames.
    df_daily.index = pd.to_datetime(df_daily.index)
    return df_daily.resample('M').last()
| 22 |
+
|
def create_features_and_target(df_monthly: pd.DataFrame, horizon_months: int):
    """Build lag/moving-average features and a binary up/down target.

    The target is 1 when the close ``horizon_months`` ahead is higher than
    the current close, else 0. Rows whose future close is unknown (the final
    ``horizon_months`` rows) are dropped rather than mislabeled.

    Returns the feature/target DataFrame with all NaN rows removed.
    """
    df = df_monthly.copy()
    close = df["Close"]

    # Lagged returns, shifted one extra month so features use only past data.
    for lag in [1, 2, 3, 4, 5]:
        df[f"ret_{lag}m"] = close.pct_change(lag).shift(1)

    # Trend features: moving averages and close/MA ratios.
    df['ma6'] = close.rolling(6).mean()
    df['ma12'] = close.rolling(12).mean()
    df['close/ma6'] = close / df['ma6']
    df['close/ma12'] = close / df['ma12']

    future_close = close.shift(-horizon_months)
    df['target'] = (future_close > close).astype(int)

    # BUG FIX: (NaN > close) evaluates False, so the last `horizon_months`
    # rows were silently labeled 0 ("down") even though their outcome is
    # unknown — drop them instead of training on fabricated labels.
    df = df[future_close.notna()]

    df = df.dropna()
    return df
|
| 36 |
+
|
def prepare_data_for_all_horizons(ticker: str, horizons=[10, 24, 60, 120]):
    """Build and save feature/target CSVs for each horizon (in months).

    Writes data/features_{h}m.csv and data/target_{h}m.csv for every h and
    prints how many samples were saved per horizon.
    """
    os.makedirs('data', exist_ok=True)
    monthly = resample_monthly(load_data(ticker))
    # Raw price columns and the label are excluded from the feature matrix.
    non_features = ['target', 'Close', 'Open', 'High', 'Low', 'Volume']
    for h in horizons:
        df_h = create_features_and_target(monthly, h)
        X = df_h[[col for col in df_h.columns if col not in non_features]]
        y = df_h['target']
        X.to_csv(f"data/features_{h}m.csv")
        y.to_csv(f"data/target_{h}m.csv", header=['target'])
        print(f"Horizonte {h} meses: {X.shape[0]} amostras salvas.")
|
| 50 |
+
|
if __name__ == "__main__":
    # Script entry point: generate feature/target CSVs for PETR4 (Petrobras)
    # using the default horizons (10, 24, 60, 120 months).
    ticker = "PETR4.SA"
    prepare_data_for_all_horizons(ticker)
|
src/model_training
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.linear_model import LogisticRegression
|
| 4 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 5 |
+
from sklearn.metrics import accuracy_score, classification_report
|
| 6 |
+
|
def load_features_target(horizon):
    """Read the saved feature matrix and target series for *horizon* months.

    Expects the CSVs produced by the feature-engineering step under data/.
    Returns (X, Y) where Y is squeezed to a Series.
    """
    features_path = f"data/features_{horizon}m.csv"
    target_path = f"data/target_{horizon}m.csv"
    X = pd.read_csv(features_path, index_col=0)
    Y = pd.read_csv(target_path, index_col=0).squeeze()
    return X, Y
|
| 12 |
+
|
def temporal_train_split(X, Y, test_size=0.2):
    """Chronological train/test split (no shuffling — avoids look-ahead bias).

    The first (1 - test_size) fraction of rows becomes the train set and the
    remaining tail the test set. Returns (X_train, X_test, Y_train, Y_test).
    """
    cutoff = int(len(X) * (1 - test_size))
    return X.iloc[:cutoff], X.iloc[cutoff:], Y.iloc[:cutoff], Y.iloc[cutoff:]
|
| 20 |
+
|
def train_model(horizon, model_type="random_forest"):
    """Train a classifier for one prediction horizon and report test metrics.

    Parameters
    ----------
    horizon : int
        Horizon in months; selects data/features_{horizon}m.csv and
        data/target_{horizon}m.csv.
    model_type : str
        'random_forest' (default) or 'logistic'.

    Returns
    -------
    tuple
        (fitted model, test accuracy, classification report string).
    """
    X, Y = load_features_target(horizon)

    X_train, X_test, Y_train, Y_test = temporal_train_split(X, Y, test_size=0.2)

    # BUG FIX: model_type used to be ignored — a RandomForest was trained even
    # when 'logistic' was requested (the train-logistic script in
    # pyproject.toml was silently broken). Dispatch on the requested type; the
    # old try/except around the constructor was dead code and is removed.
    if model_type == "random_forest":
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    elif model_type == "logistic":
        model = LogisticRegression(max_iter=1000)
    else:
        raise ValueError(f"There's a error in training model: unknown model_type {model_type!r}")

    model.fit(X_train, Y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    # NOTE(review): assumes both classes appear in the test split; with a
    # single class present, classification_report raises — confirm on real data.
    report = classification_report(Y_test, y_pred, target_names=['Queda', 'Sobe'])

    print(f"\n--- Horizonte: {horizon} ---")
    print(f"Acurácia no teste: {accuracy:.4f}")
    print("Relatório de classificação:")
    print(report)

    return model, accuracy, report
|