# johnaness's picture
# Deploy OStock FastAPI backend to HF Space (Docker SDK, port 7860)
# 4be2d4d
"""
์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ๋ฐ ์ „์ฒ˜๋ฆฌ ์Šคํฌ๋ฆฝํŠธ
"""
import sys
import os
import argparse
import numpy as np
import json
import pickle
from datetime import datetime
from pathlib import Path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.data.data_integration import process_stock_data
from src.data.processors import prepare_data
def _build_id_map(frame, name_col, id_col):
    """Return a ``{name: int(id)}`` mapping read from paired columns of *frame*.

    The pairs are taken row-wise (after dropping rows where either value is
    missing and de-duplicating), so a name is always associated with an id
    that actually appears next to it in the data.  Returns an empty dict when
    either column is absent.
    """
    if name_col not in frame.columns or id_col not in frame.columns:
        return {}
    pairs = frame[[name_col, id_col]].dropna().drop_duplicates()
    return {name: int(ident) for name, ident in zip(pairs[name_col], pairs[id_col])}


def main(argv=None):
    """Collect stock data, preprocess it, and write the results under ./data.

    Steps performed:
      1. Parse command-line options (tickers, date range, FRED key, window size).
      2. Call ``process_stock_data`` and print a summary of the returned frame.
      3. Save the raw frame as ``./data/<tickers>_data.csv``.
      4. Call ``prepare_data`` and pickle its result, augmented with
         ``sector_map``, ``industry_map`` and ``feature_count``, to
         ``./data/processed/<tickers>_processed.pkl``.
      5. Write encoder info and run metadata as JSON files next to the pickle.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.
    """
    parser = argparse.ArgumentParser(description="์ฃผ์‹ ๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ๋ฐ ์ „์ฒ˜๋ฆฌ")
    parser.add_argument('--tickers', type=str, default='NFLX,TSLA,NVDA,AMD,INTC',
                        help='์ˆ˜์ง‘ํ•  ์ข…๋ชฉ (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„)')
    parser.add_argument('--start_date', type=str, default='2020-01-01',
                        help='์‹œ์ž‘ ๋‚ ์งœ (YYYY-MM-DD)')
    parser.add_argument('--end_date', type=str, default=None,
                        help='์ข…๋ฃŒ ๋‚ ์งœ (YYYY-MM-DD), ๋ฏธ์ง€์ •์‹œ ์˜ค๋Š˜')
    # NOTE(security): an API key is committed in source as the fallback default.
    # It is kept so existing invocations keep working, but it should be rotated
    # and supplied via the FRED_API_KEY environment variable or --fred_api_key.
    parser.add_argument('--fred_api_key', type=str,
                        default=os.environ.get('FRED_API_KEY',
                                               '4c55d0ee6170369793707da4cba1b7be'),
                        help='FRED API ํ‚ค')
    parser.add_argument('--window_size', type=int, default=60,
                        help='์‹œ๊ณ„์—ด ์œˆ๋„์šฐ ํฌ๊ธฐ')
    args = parser.parse_args(argv)

    # Parse the ticker list; strip whitespace and drop empty entries so that
    # inputs such as "NFLX, TSLA" or a trailing comma do not produce bogus
    # symbols (which would match no rows and leak spaces into file names).
    training_tickers = [t.strip() for t in args.tickers.split(',') if t.strip()]
    if not training_tickers:
        parser.error("--tickers must contain at least one ticker symbol")

    # Date range; the end date falls back to today when not given.
    start_date = args.start_date
    end_date = args.end_date or datetime.today().strftime('%Y-%m-%d')

    print(f"๋ฐ์ดํ„ฐ ์ˆ˜์ง‘ ์‹œ์ž‘: {', '.join(training_tickers)}")
    print(f"๊ธฐ๊ฐ„: {start_date} ~ {end_date}")

    # Collect and merge the data. Only the merged frame is used below; the
    # other two return values are not needed by this script.
    final_data, _all_data, _industry_encoders = process_stock_data(
        training_tickers,
        start_date,
        end_date,
        args.fred_api_key
    )

    # Print a summary of the merged dataset.
    has_ticker_column = 'ticker' in final_data.columns
    print("\n===== ํ†ตํ•ฉ ๋ฐ์ดํ„ฐ์…‹ ์š”์•ฝ =====")
    print(f"ํฌ๊ธฐ: {final_data.shape[0]}ํ–‰, {final_data.shape[1]}์—ด")
    if has_ticker_column:
        print(f"์ข…๋ชฉ ์ˆ˜: {final_data['ticker'].nunique()}๊ฐœ")
    if 'sector' in final_data.columns:
        print(f"์„นํ„ฐ ์ˆ˜: {final_data['sector'].nunique()}๊ฐœ")
    if 'industry' in final_data.columns:
        print(f"์‚ฐ์—… ์ˆ˜: {final_data['industry'].nunique()}๊ฐœ")
    # Guarded like the set_index step further down, so a frame without a
    # 'Date' column does not abort the run with a KeyError.
    if 'Date' in final_data.columns:
        print(f"๊ธฐ๊ฐ„: {final_data['Date'].min()} ~ {final_data['Date'].max()}")

    # Per-ticker row counts.
    print("์ข…๋ชฉ๋ณ„ ํ–‰ ์ˆ˜:")
    for ticker in training_tickers:
        if has_ticker_column:
            count = int((final_data['ticker'] == ticker).sum())
        else:
            count = "์ •๋ณด ์—†์Œ"
        print(f" - {ticker}: {count}ํ–‰")

    # Prepare the output directory.
    data_dir = Path("./data")
    data_dir.mkdir(parents=True, exist_ok=True)
    all_tickers = '_'.join(training_tickers)
    raw_filename = data_dir / f"{all_tickers}_data.csv"

    # Save the raw merged data.
    final_data.to_csv(raw_filename, index=False)
    print(f"์›๋ณธ ๋ฐ์ดํ„ฐ๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {raw_filename}")

    # Index by date (when available) and sort before preprocessing.
    if 'Date' in final_data.columns:
        final_data = final_data.set_index('Date')
    final_data.sort_index(inplace=True)

    # Run preprocessing to build the model-training arrays.
    print("\n์ „์ฒ˜๋ฆฌ ๋ฐ ํ•™์Šต์šฉ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ ์‹œ์ž‘...")
    processed_data, ticker_encoder, _ = prepare_data(final_data, window_size=args.window_size)

    # Sector / industry name -> id maps (empty when the columns are absent).
    # Built from paired rows rather than from two independent unique() calls,
    # which could pair a name with the wrong id or silently drop entries.
    sector_map = _build_id_map(final_data, 'sector', 'sector_id')
    industry_map = _build_id_map(final_data, 'industry', 'industry_id')

    # Attach the maps and the feature count to the preprocessing result.
    feature_count = processed_data['x_train'].shape[2]
    processed_data['sector_map'] = sector_map
    processed_data['industry_map'] = industry_map
    processed_data['feature_count'] = feature_count

    # Save the preprocessed data (pickle format).
    processed_dir = data_dir / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)
    processed_filename = processed_dir / f"{all_tickers}_processed.pkl"
    with open(processed_filename, 'wb') as f:
        pickle.dump(processed_data, f)
    print(f"์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {processed_filename}")

    # Save encoder information (JSON format).
    encoder_info = {
        'ticker_encoder': {str(i): ticker for i, ticker in enumerate(ticker_encoder.classes_)},
        'sector_map': sector_map,
        'industry_map': industry_map
    }
    encoder_filename = processed_dir / f"{all_tickers}_encoder_info.json"
    with open(encoder_filename, 'w', encoding='utf-8') as f:
        json.dump(encoder_info, f)
    print(f"์ธ์ฝ”๋” ์ •๋ณด๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {encoder_filename}")

    # Save run metadata. start/end dates prefer values reported by
    # prepare_data and fall back to the requested range.
    metadata = {
        'feature_count': feature_count,
        'window_size': args.window_size,
        'tickers': training_tickers,
        'start_date': str(processed_data.get('start_date', start_date)),
        'end_date': str(processed_data.get('end_date', end_date)),
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    metadata_filename = processed_dir / f"{all_tickers}_metadata.json"
    with open(metadata_filename, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)
    print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค: {metadata_filename}")

    print("\n===== ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ =====")
    print(f"ํ•™์Šต ๋ฐ์ดํ„ฐ ํฌ๊ธฐ: {processed_data['x_train'].shape}")
    print(f"๊ฒ€์ฆ ๋ฐ์ดํ„ฐ ํฌ๊ธฐ: {processed_data['x_val'].shape}")
    print(f"ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ ํฌ๊ธฐ: {processed_data['x_test'].shape}")
    print(f"ํŠน์„ฑ ์ˆ˜: {processed_data['feature_count']}")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()