Spaces:
Sleeping
Sleeping
| """ | |
| Stock data collection and preprocessing script. | |
| """ | |
| import sys | |
| import os | |
| import argparse | |
| import numpy as np | |
| import json | |
| import pickle | |
| from datetime import datetime | |
| from pathlib import Path | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from src.data.data_integration import process_stock_data | |
| from src.data.processors import prepare_data | |
def _parse_tickers(raw):
    """Split a comma-separated ticker string into a clean list.

    Whitespace around each symbol is stripped and empty entries (for example
    from a trailing comma) are dropped, so ``"NFLX, TSLA,"`` becomes
    ``["NFLX", "TSLA"]``.
    """
    return [symbol.strip() for symbol in raw.split(',') if symbol.strip()]


def _build_id_map(frame, name_col, id_col):
    """Return ``{name: int(id)}`` built from two paired columns of *frame*.

    The pairs are read row-wise and de-duplicated, so a name is always matched
    with an id that appears on the same row.  Rows where either value is
    missing are skipped.  An empty dict is returned when either column is
    absent from *frame*.
    """
    if name_col not in frame.columns or id_col not in frame.columns:
        return {}
    pairs = frame[[name_col, id_col]].dropna().drop_duplicates()
    return {
        name: int(ident)
        for name, ident in zip(pairs[name_col], pairs[id_col])
    }


def _print_summary(final_data, tickers):
    """Print shape, category counts, date range and per-ticker row counts."""
    print("\n===== 통합 데이터셋 요약 =====")
    print(f"크기: {final_data.shape[0]}행, {final_data.shape[1]}열")
    for column, label in (('ticker', '종목'), ('sector', '섹터'), ('industry', '산업')):
        if column in final_data.columns:
            print(f"{label} 수: {final_data[column].nunique()}개")
    # Guarded like the later set_index('Date') step so a frame without a
    # 'Date' column does not abort the summary with a KeyError.
    if 'Date' in final_data.columns:
        print(f"기간: {final_data['Date'].min()} ~ {final_data['Date'].max()}")

    print("종목별 행 수:")
    # Count every ticker in one pass instead of filtering the frame per ticker.
    counts = (
        final_data['ticker'].value_counts()
        if 'ticker' in final_data.columns
        else None
    )
    for ticker in tickers:
        count = "정보 없음" if counts is None else int(counts.get(ticker, 0))
        print(f" - {ticker}: {count}행")


def _dump_json(payload, path):
    """Write *payload* to *path* as JSON using an explicit UTF-8 encoding."""
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle)


def main():
    """Collect stock data, preprocess it and persist the results under ./data.

    Steps, in order:
      1. Parse the command-line options.
      2. Fetch and integrate the data through ``process_stock_data``.
      3. Print a summary and save the integrated table as CSV.
      4. Build the model-ready arrays through ``prepare_data``.
      5. Save the processed data (pickle), encoder info (JSON) and run
         metadata (JSON) under ``./data/processed``.

    Exits with an argparse usage error (status 2) when ``--tickers`` contains
    no ticker symbol.
    """
    parser = argparse.ArgumentParser(description="주식 데이터 수집 및 전처리")
    parser.add_argument('--tickers', type=str, default='NFLX,TSLA,NVDA,AMD,INTC',
                        help='수집할 종목 (쉼표로 구분)')
    parser.add_argument('--start_date', type=str, default='2020-01-01',
                        help='시작 날짜 (YYYY-MM-DD)')
    parser.add_argument('--end_date', type=str, default=None,
                        help='종료 날짜 (YYYY-MM-DD), 미지정 시 오늘')
    # NOTE(security): an API key literal is committed here. The FRED_API_KEY
    # environment variable now takes precedence; the literal is kept only so
    # existing invocations keep working. Rotate the key and drop the literal.
    parser.add_argument('--fred_api_key', type=str,
                        default=os.environ.get('FRED_API_KEY',
                                               '4c55d0ee6170369793707da4cba1b7be'),
                        help='FRED API 키')
    parser.add_argument('--window_size', type=int, default=60,
                        help='시계열 윈도우 크기')
    args = parser.parse_args()

    # Parse the ticker list; fail fast before any data is fetched.
    training_tickers = _parse_tickers(args.tickers)
    if not training_tickers:
        parser.error("--tickers must contain at least one ticker symbol")

    # Date range: the end date defaults to today.
    start_date = args.start_date
    end_date = args.end_date or datetime.today().strftime('%Y-%m-%d')
    print(f"데이터 수집 시작: {', '.join(training_tickers)}")
    print(f"기간: {start_date} ~ {end_date}")

    # Fetch and integrate the data; only the integrated frame is used below.
    final_data, _all_data, _industry_encoders = process_stock_data(
        training_tickers,
        start_date,
        end_date,
        args.fred_api_key,
    )

    _print_summary(final_data, training_tickers)

    # Prepare the output directory and save the integrated table.
    data_dir = Path("./data")
    data_dir.mkdir(parents=True, exist_ok=True)
    all_tickers = '_'.join(training_tickers)
    raw_filename = data_dir / f"{all_tickers}_data.csv"
    final_data.to_csv(raw_filename, index=False)
    print(f"원본 데이터가 저장되었습니다: {raw_filename}")

    # Index by date (when available) and order the rows before preprocessing.
    if 'Date' in final_data.columns:
        final_data = final_data.set_index('Date')
    final_data = final_data.sort_index()

    print("\n전처리 및 학습용 데이터 생성 시작...")
    processed_data, ticker_encoder, _ = prepare_data(
        final_data, window_size=args.window_size
    )

    # Attach the sector/industry lookup tables (empty when the columns are absent).
    sector_map = _build_id_map(final_data, 'sector', 'sector_id')
    industry_map = _build_id_map(final_data, 'industry', 'industry_id')
    processed_data['sector_map'] = sector_map
    processed_data['industry_map'] = industry_map
    # The feature count is read from the third axis of x_train.
    processed_data['feature_count'] = processed_data['x_train'].shape[2]

    # Save the processed data as a pickle.
    processed_dir = data_dir / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)
    processed_filename = processed_dir / f"{all_tickers}_processed.pkl"
    with open(processed_filename, 'wb') as handle:
        pickle.dump(processed_data, handle)
    print(f"전처리된 데이터가 저장되었습니다: {processed_filename}")

    # Save the encoder information as JSON.
    encoder_info = {
        'ticker_encoder': {
            str(index): ticker
            for index, ticker in enumerate(ticker_encoder.classes_)
        },
        'sector_map': sector_map,
        'industry_map': industry_map,
    }
    encoder_filename = processed_dir / f"{all_tickers}_encoder_info.json"
    _dump_json(encoder_info, encoder_filename)
    print(f"인코더 정보가 저장되었습니다: {encoder_filename}")

    # Save the run metadata as JSON.
    metadata = {
        'feature_count': processed_data.get('feature_count', 0),
        'window_size': args.window_size,
        'tickers': training_tickers,
        'start_date': str(processed_data.get('start_date', start_date)),
        'end_date': str(processed_data.get('end_date', end_date)),
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    metadata_filename = processed_dir / f"{all_tickers}_metadata.json"
    _dump_json(metadata, metadata_filename)
    print(f"메타데이터가 저장되었습니다: {metadata_filename}")

    print("\n===== 전처리 완료 =====")
    print(f"학습 데이터 크기: {processed_data['x_train'].shape}")
    print(f"검증 데이터 크기: {processed_data['x_val'].shape}")
    print(f"테스트 데이터 크기: {processed_data['x_test'].shape}")
    print(f"특성 수: {processed_data['feature_count']}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()