File size: 2,795 Bytes
57789e6
 
 
 
af171b5
43056e4
af171b5
 
 
 
57789e6
 
 
af171b5
 
 
 
 
 
 
 
 
43056e4
af171b5
 
 
 
 
 
 
 
 
 
57789e6
 
 
af171b5
 
 
 
 
 
57789e6
af171b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57789e6
af171b5
 
 
57789e6
 
af171b5
 
 
 
 
 
 
 
 
 
 
 
 
57789e6
af171b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57789e6
af171b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
import numpy as np
import os


def load_data(ticker, data_dir='data'):
    """Load the raw price CSV for *ticker* and return it indexed by date.

    The file is looked up as ``raw_<ticker>.csv`` inside *data_dir*, with
    every ``.`` in the ticker replaced by ``_``. The first CSV column becomes
    the index and is parsed as datetimes; rows whose index cannot be parsed
    are discarded.

    Args:
        ticker: Ticker symbol used to build the file name.
        data_dir: Directory that contains the raw CSV file.

    Returns:
        The loaded DataFrame with a datetime index.

    Raises:
        FileNotFoundError: If the expected CSV file does not exist.
        ValueError: If the loaded data has no ``Close`` column.
    """
    print("=================================")
    print("Iniciando carregamento de dados")

    csv_name = f"raw_{ticker.replace('.', '_')}.csv"
    csv_path = os.path.join(data_dir, csv_name)
    print(f"Carregando arquivo: {csv_path}")

    file_present = os.path.exists(csv_path)
    if not file_present:
        raise FileNotFoundError(f"Arquivo não encontrado: {csv_path}")

    frame = pd.read_csv(csv_path, index_col=0)

    print("Convertendo índice para datetime...")
    # Unparseable index values become NaT instead of raising.
    frame.index = pd.to_datetime(frame.index, errors='coerce')

    print("Removendo datas inválidas...")
    has_valid_date = frame.index.notna()
    frame = frame[has_valid_date]

    if "Close" in frame.columns:
        print("Dataset carregado:", frame.shape)
        return frame

    print("Colunas disponíveis:", frame.columns)
    raise ValueError("Coluna 'Close' não encontrada")


def create_features_and_target(df, horizon_days):
    """Build return / moving-average features and a binary up-or-down target.

    Works on a copy of *df*; the input frame is not modified. The added
    columns are:

    * ``ret_{5,10,20,50}d``: percentage change of ``Close`` over the lag,
      shifted by one row so it only uses closes up to the previous row.
    * ``ma20`` / ``ma50``: rolling means of ``Close`` (current row included).
    * ``close_ma20_ratio`` / ``close_ma50_ratio``: ``Close`` divided by the
      corresponding moving average.
    * ``target``: 1 if ``Close`` *horizon_days* rows ahead is strictly greater
      than the current ``Close``, otherwise 0.

    Rows containing any missing value are dropped, and so are the trailing
    rows for which no close exists *horizon_days* rows ahead.

    Args:
        df: DataFrame with at least a ``Close`` column.
        horizon_days: Number of rows to look ahead for the target; must be
            at least 1.

    Returns:
        A new DataFrame with the feature and target columns added.

    Raises:
        ValueError: If *horizon_days* is smaller than 1.
    """
    if horizon_days < 1:
        raise ValueError(
            f"horizon_days deve ser um inteiro positivo, recebido: {horizon_days}"
        )

    print(f"Criando features para horizonte {horizon_days} dias")

    df = df.copy()

    close = df["Close"]

    # Past returns, shifted one row so the current close is not used.
    for lag in [5, 10, 20, 50]:
        df[f"ret_{lag}d"] = close.pct_change(lag).shift(1)

    # Moving averages.
    df["ma20"] = close.rolling(20).mean()
    df["ma50"] = close.rolling(50).mean()

    # Price relative to its moving averages.
    df["close_ma20_ratio"] = close / df["ma20"]
    df["close_ma50_ratio"] = close / df["ma50"]

    # Future target.
    future_close = close.shift(-horizon_days)
    df["target"] = (future_close > close).astype(int)

    # Comparing against a missing future close yields False, which would be
    # stored as a 0 label that dropna cannot detect. Those trailing rows have
    # no known outcome, so they are removed explicitly along with any row
    # that has a missing value.
    complete_rows = df.notna().all(axis=1) & future_close.notna()
    df = df.loc[complete_rows.to_numpy()].copy()

    print("Features criadas:", df.shape)

    return df


def prepare_data_for_all_horizons(ticker, horizons=(30, 90, 180, 360), data_dir="data"):
    """Build and save feature/target CSV files for each prediction horizon.

    Loads the raw data for *ticker* from *data_dir*, then for every horizon
    creates the features and target and writes them to
    ``features_<h>d.csv`` and ``target_<h>d.csv`` inside *data_dir*. The raw
    price/volume columns and the target itself are excluded from the feature
    file.

    Args:
        ticker: Ticker symbol whose raw CSV should be processed.
        horizons: Iterable of look-ahead horizons, in rows.
        data_dir: Directory used both to read the raw CSV and to write the
            output files. It is created if it does not exist.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("=================================")
    print(f"Preparando dados para {ticker}")
    print("=================================")

    os.makedirs(data_dir, exist_ok=True)

    df = load_data(ticker, data_dir=data_dir)

    print("Total de dados:", len(df))

    # Columns that must never be written as features: the label and the raw
    # price/volume columns. Built once, outside the loop.
    non_feature_cols = {
        "target",
        "Close",
        "Open",
        "High",
        "Low",
        "Volume",
        "Adj Close",
    }

    for h in horizons:

        print("---------------------------------")
        print(f"Horizonte: {h} dias")

        df_h = create_features_and_target(df, h)

        feature_cols = [
            col for col in df_h.columns if col not in non_feature_cols
        ]

        X = df_h[feature_cols]
        y = df_h["target"]

        X_path = os.path.join(data_dir, f"features_{h}d.csv")
        y_path = os.path.join(data_dir, f"target_{h}d.csv")

        X.to_csv(X_path)
        y.to_csv(y_path, header=["target"])

        print("Features salvas:", X_path)
        print("Target salvo:", y_path)
        print("Amostras:", X.shape[0])


# Script entry point: when the file is executed directly, prepare the
# datasets for the PETR4.SA ticker using the default horizons.
if __name__ == "__main__":

    prepare_data_for_all_horizons("PETR4.SA")