File size: 5,139 Bytes
4be2d4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ๋ฐ ์ „์ฒ˜๋ฆฌ ๋ชจ๋“ˆ
"""

import pandas as pd
import yfinance as yf

from .technical_indicators import add_technical_indicators
from .optimization import run_technical_optimization
from .financial_data import process_financial_data
from .economic_data import get_economic_data
from .hierarchical_embedding import get_industry_data, add_industry_encoding, combine_stocks_for_embedding
from src.config import FRED_API_KEY

def process_stock_data(tickers, start_date, end_date, fred_api_key=None):
    """
    ์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ์˜ ์ „์ฒด ํŒŒ์ดํ”„๋ผ์ธ์„ ์‹คํ–‰ํ•˜๋Š” ํ•จ์ˆ˜
    """
    # API ํ‚ค ๊ธฐ๋ณธ๊ฐ’ ์ฒ˜๋ฆฌ
    if fred_api_key is None:
        fred_api_key = FRED_API_KEY
    
    print(f"์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘: {len(tickers)}๊ฐœ ์ข…๋ชฉ")
    
    # 1. ๊ธฐ์ˆ ์  ์ง€ํ‘œ ์ตœ์ ํ™”
    optimal_params = run_technical_optimization(tickers, start_date, end_date)
    
    # 2. ์ตœ์ ํ™”๋œ ํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ
    all_data = {}
    
    for ticker in tickers:
        df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel(1)
            
        # ์ตœ์ ํ™”๋œ ํŒŒ๋ผ๋ฏธํ„ฐ๋กœ ๊ธฐ์ˆ ์  ์ง€ํ‘œ ์ถ”๊ฐ€
        df_with_indicators = add_technical_indicators(
            df.copy(),
            ema_params=optimal_params['ema'],
            macd_params=optimal_params['macd'],
            cmf_period=optimal_params['cmf'],
            rsi_params=optimal_params['rsi']
        )
        
        # ๊ฒฐ์ธก์น˜ ์ œ๊ฑฐ
        df_with_indicators = df_with_indicators.dropna()
        
        all_data[ticker] = df_with_indicators
        print(f"{ticker} ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {len(df_with_indicators)}ํ–‰")
    
    # 3. ์žฌ๋ฌด์ œํ‘œ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ๋ฐ ํ†ตํ•ฉ
    print("\n===== ์žฌ๋ฌด์ œํ‘œ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘ =====\n")
    financial_data_all = {}
    common_features = None

    # ๋ชจ๋“  ์ข…๋ชฉ์˜ ์žฌ๋ฌด ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘
    for ticker in tickers:
        if ticker not in all_data:
            continue
            
        financial_data = process_financial_data(ticker, all_data, end_date)
        
        if financial_data is not None and not financial_data.empty:
            financial_features = [col for col in financial_data.columns if col != 'Close']
            if financial_features:
                financial_data_all[ticker] = financial_data
                
                # ๊ณตํ†ต ํŠน์„ฑ ์ถ”์ 
                if common_features is None:
                    common_features = set(financial_features)
                else:
                    common_features = common_features.intersection(set(financial_features))
    
    # 4. ๊ฒฝ์ œ ์ง€ํ‘œ ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
    print("\n===== ๊ฒฝ์ œ ์ง€ํ‘œ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘ =====\n")
    econ_df = get_economic_data(start_date, end_date, fred_api_key)
    
    # 5. ๋ฐ์ดํ„ฐ ์ตœ์ข… ํ†ตํ•ฉ (๊ธฐ์ˆ  ์ง€ํ‘œ + ์žฌ๋ฌด์ œํ‘œ + ๊ฒฝ์ œ ์ง€ํ‘œ)
    print("\n===== ์ตœ์ข… ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ =====\n")
    
    # ๊ณตํ†ต ์žฌ๋ฌด ํŠน์„ฑ ์„ ํƒ
    if common_features:
        selected_common_features = list(common_features)[:20]  # ์ƒ์œ„ 20๊ฐœ
        print(f"๋ชจ๋“  ์ข…๋ชฉ์— ๊ณตํ†ต์ธ ์žฌ๋ฌด ํŠน์„ฑ ์ค‘ {len(selected_common_features)}๊ฐœ ์„ ํƒ")
    else:
        selected_common_features = []
        print("๊ณตํ†ต ์žฌ๋ฌด ํŠน์„ฑ์ด ์—†์Šต๋‹ˆ๋‹ค.")
    
    # ๊ฐ ์ข…๋ชฉ๋ณ„ ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ
    for ticker in tickers:
        if ticker in all_data:
            try:
                # ๊ธฐ๋ณธ ๋ฐ์ดํ„ฐ (๊ธฐ์ˆ ์  ์ง€ํ‘œ ํฌํ•จ)
                stock_data = all_data[ticker].copy()
                
                # ์žฌ๋ฌด์ œํ‘œ ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€ (์„ ํƒ๋œ ๊ณตํ†ต ํŠน์„ฑ๋งŒ)
                if ticker in financial_data_all and selected_common_features:
                    fin_data = financial_data_all[ticker][selected_common_features]
                    stock_data = stock_data.join(fin_data, how='left')
                
                # ๊ฒฝ์ œ ์ง€ํ‘œ ๋ฐ์ดํ„ฐ ์ถ”๊ฐ€
                stock_data = stock_data.join(econ_df, how='left')
                
                # ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ
                for col in stock_data.columns:
                    if col != 'Close' and stock_data[col].isna().any():
                        stock_data[col] = stock_data[col].interpolate(method='linear')
                
                stock_data = stock_data.dropna()
                
                # ์ตœ์ข… ๋ฐ์ดํ„ฐ ์ €์žฅ
                all_data[ticker] = stock_data
                
                print(f"{ticker} ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ์™„๋ฃŒ: {stock_data.shape[1]}๊ฐœ ํŠน์„ฑ")
                
            except Exception as e:
                print(f"{ticker} ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ์‹คํŒจ: {e}")
    
    # 6. ์‚ฐ์—… ์ •๋ณด ๊ฐ€์ ธ์˜ค๊ธฐ ๋ฐ ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ
    industry_df = get_industry_data(tickers)
    
    # 7. ์ข…๋ชฉ ์ž„๋ฒ ๋”ฉ์„ ์œ„ํ•œ ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ
    combined_data = combine_stocks_for_embedding(all_data, tickers)
    
    # 8. ์‚ฐ์—… ์ •๋ณด ์ธ์ฝ”๋”ฉ ์ถ”๊ฐ€
    final_data, industry_encoders = add_industry_encoding(combined_data, industry_df)
    
    return final_data, all_data, industry_encoders