ostock-backend / model /src /data /financial_data.py
johnaness's picture
Deploy OStock FastAPI backend to HF Space (Docker SDK, port 7860)
4be2d4d
"""
Functions for scraping and processing financial statement data.
"""
import pandas as pd
import numpy as np
import requests
import warnings
from bs4 import BeautifulSoup
from statsmodels.tsa.holtwinters import ExponentialSmoothing
warnings.filterwarnings("ignore", message="Optimization failed to converge")
def scrape_financial_statement(ticker, statement_type):
    """
    Scrape one quarterly financial statement table for a ticker.

    The page is requested from stockanalysis.com, the first
    ``table[data-test='financials']`` element is parsed with pandas, and the
    table is transposed so that the original header labels become the index
    (named ``"Date"``) and the first table column becomes the column labels.

    Args:
        ticker: Ticker symbol interpolated into the URL path.
        statement_type: One of ``'income'``, ``'ratios'``,
            ``'balance-sheet'`` or ``'cash-flow-statement'``.

    Returns:
        The transposed DataFrame, or ``None`` when the statement type is
        unknown, the request or parsing raises, or no matching table exists.
    """
    from io import StringIO  # function-scope import; only this block needs it

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    # URL path suffix for each statement type
    url_paths = {
        'income': '',
        'ratios': 'ratios/',
        'balance-sheet': 'balance-sheet/',
        'cash-flow-statement': 'cash-flow-statement/'
    }
    # Display labels used in the log messages
    statement_names = {
        'income': '์ˆ˜์ต๊ณ„์‚ฐ์„œ',
        'ratios': '์žฌ๋ฌด๋น„์œจ',
        'balance-sheet': '๋Œ€์ฐจ๋Œ€์กฐํ‘œ',
        'cash-flow-statement': 'ํ˜„๊ธˆํ๋ฆ„ํ‘œ'
    }
    # Resolve the label once with a fallback: looking it up inside the except
    # handler would raise a second KeyError for an unknown statement type and
    # defeat the "return None on failure" contract.
    label = statement_names.get(statement_type, statement_type)

    try:
        url = f"https://stockanalysis.com/stocks/{ticker}/financials/{url_paths[statement_type]}?p=quarterly"
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, headers=headers, timeout=30)
        print(f"{label} ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")

        soup = BeautifulSoup(response.content, 'html.parser')
        element_tables = soup.select("table[data-test='financials']")
        if not element_tables:
            print(f"{ticker}: {label} ํ…Œ์ด๋ธ”์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
            return None

        # Parse only the first matched table, wrapped in a file-like object:
        # passing literal HTML strings to read_html is deprecated.
        df = pd.read_html(StringIO(str(element_tables[0])))[0]

        # Flatten a two-level header down to its second level
        if isinstance(df.columns, pd.MultiIndex):
            print(f"{ticker}: MultiIndex {label} ์ฒ˜๋ฆฌ")
            date_cols = df.columns.get_level_values(1)[1:]  # "Period Ending" values
            df = df.droplevel(0, axis=1)  # drop the first header level
            df.columns = [df.columns[0]] + list(date_cols)

        result_df = df.set_index(df.columns[0]).transpose()
        result_df.index.name = "Date"

        # Trim rows that are not wanted downstream: ratios lose the first and
        # the last row, every other statement loses only the last row.
        # NOTE(review): presumably these are non-period columns of the site
        # table - confirm against the live page layout.
        if statement_type == 'ratios':
            result_df = result_df.iloc[1:-1, :]
        else:
            result_df = result_df.iloc[:-1, :]

        # Return an independent frame so later in-place edits by callers do
        # not operate on a slice of the parsed table.
        return result_df.copy()
    except Exception as e:
        print(f"{ticker} {label} ์Šคํฌ๋ž˜ํ•‘ ์˜ค๋ฅ˜: {e}")
        return None
def _parse_financial_value(value):
    """Turn one scraped cell into a float; unparseable text becomes NaN."""
    if not isinstance(value, str):
        return value
    # Thousands separators would otherwise make float() fail.
    text = value.strip().replace(',', '')
    is_percent = text.endswith('%')
    if is_percent:
        text = text[:-1]
    try:
        number = float(text)
    except ValueError:
        # Placeholders such as '-' or any other non-numeric text
        return float('nan')
    return number / 100 if is_percent else number


def convert_to_numeric(df):
    """
    Convert every text column of ``df`` to numeric values.

    Handles signed numbers (``'-123'`` -> ``-123.0``), thousands separators
    (``'1,234'`` -> ``1234.0``) and percentages (``'12%'`` -> ``0.12``).
    Text that cannot be parsed becomes NaN instead of raising. Columns that
    are already numeric are left untouched.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for column in df.columns:
        series = df[column]
        # Cover both the classic object dtype and pandas' dedicated string dtypes.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        parsed = series.map(_parse_financial_value)
        # Final coercion catches any non-string leftovers (None, mixed objects).
        df[column] = pd.to_numeric(parsed, errors='coerce')
    return df
def convert_date_format(date_str):
    """
    Convert a quarter-style period label to an ISO ``YYYY-MM-DD`` string.

    A label is converted only when it is a string that contains an
    apostrophe and has at least four whitespace-separated tokens; its last
    three tokens are read as month name, day and year
    (e.g. ``"Q1 '23 Mar 31, 2023"`` -> ``"2023-03-31"``).

    Args:
        date_str: Period label, normally taken from a statement's index.

    Returns:
        The ISO date string; ``None`` when the date lies after today or the
        label cannot be converted; the input unchanged when it does not
        match the quarter-style shape.
    """
    import datetime  # function-scope import; only this block needs it

    # Any other shape is handed back untouched for the caller to interpret.
    if not (isinstance(date_str, str) and "'" in date_str):
        return date_str
    parts = date_str.split()
    if len(parts) < 4:
        return date_str

    month_numbers = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
    }
    try:
        # Accept full month names too by matching on the first three letters.
        month_key = parts[-3][:3].capitalize()
        if month_key not in month_numbers:
            # Guessing a month would silently misdate the whole row.
            raise ValueError(f"unknown month '{parts[-3]}'")
        day = int(parts[-2].replace(',', ''))
        year = int(parts[-1])
        # Building a real date also rejects impossible days such as Feb 31.
        period_end = datetime.date(year, month_numbers[month_key], day)
    except ValueError as e:
        print(f"๋‚ ์งœ ๋ณ€ํ™˜ ์˜ค๋ฅ˜: {e} - '{date_str}'")
        return None

    # Future periods are discarded; compare against the clock rather than a
    # hard-coded year so the filter does not go stale.
    if period_end > datetime.date.today():
        return None
    return period_end.isoformat()
def process_index_dates(df):
    """
    Convert the index labels of ``df`` with ``convert_date_format``.

    Rows whose label converts to a missing value are dropped. The input
    frame is not modified, and the result keeps the input's index name.

    Args:
        df: DataFrame whose index holds period labels.

    Returns:
        A new DataFrame indexed by the converted labels, or ``None`` when no
        row survives the conversion.
    """
    # dtype=object keeps the converted labels exactly as returned.
    converted = pd.Series(
        [convert_date_format(label) for label in df.index], dtype=object
    )
    keep = converted.notna().to_numpy()
    if not keep.any():
        return None

    # Select and copy rather than staging the labels in a temporary column:
    # that would alter the caller's frame and leak the helper column's name
    # into the result index.
    result = df.loc[keep].copy()
    result.index = pd.Index(converted[keep].to_numpy(), name=df.index.name)
    return result
def interpolate_and_forecast(df, end_date):
    """
    Expand period data to daily frequency and extend it up to ``end_date``.

    The index is parsed as datetimes, the rows are resampled to one per day,
    and gaps are filled by linear interpolation. When ``end_date`` lies
    after the last available day, each column is extended with an additive
    trend ``ExponentialSmoothing`` forecast. Columns that still contain any
    missing value are removed from the result.

    Args:
        df: DataFrame indexed by date-like labels. It is not modified.
        end_date: Last day the result should cover; anything accepted by
            ``pd.to_datetime``.

    Returns:
        A new daily DataFrame holding only columns without missing values.
        An empty input yields an empty copy.
    """
    # Nothing to resample; also avoids indexing the last row of an empty index.
    if len(df.index) == 0:
        return df.copy()

    # Work on a copy so the caller's index is not replaced as a side effect.
    frame = df.copy()
    frame.index = pd.to_datetime(frame.index)
    frame = frame.sort_index()

    # One row per calendar day, gaps filled linearly
    daily_df = frame.resample('D').asfreq()
    for column in daily_df.columns:
        daily_df[column] = daily_df[column].interpolate(method='linear')

    end_date = pd.to_datetime(end_date)
    last_day = daily_df.index[-1]
    forecast_steps = (end_date - last_day).days
    if forecast_steps > 0:
        print(f"์˜ˆ์ธก ์‹œ์ž‘: {forecast_steps}์ผ")
        # Columns with gaps are removed from the result regardless of their
        # forecast, so drop them first instead of fitting a model for them.
        daily_df = daily_df.dropna(axis=1, how='any')
        date_range = pd.date_range(last_day + pd.Timedelta(days=1), end_date)

        # Collect all forecasts first, then build the frame in one step
        forecasts = {}
        for column in daily_df.columns:
            try:
                model = ExponentialSmoothing(
                    daily_df[column], trend='add', seasonal=None, seasonal_periods=4
                ).fit()
                forecasts[column] = model.forecast(steps=forecast_steps)
            except Exception as e:
                # Best effort: a failed fit marks the column as missing so it
                # is dropped below rather than aborting the whole frame.
                print(f"{column} ์˜ˆ์ธก ์‹คํŒจ: {e}")
                forecasts[column] = np.full(forecast_steps, np.nan)

        forecast_df = pd.DataFrame(forecasts, index=date_range)
        daily_df = pd.concat([daily_df, forecast_df])

    # Keep only columns that are complete over the whole range
    return daily_df.dropna(axis=1, how='any')
def process_financial_data(ticker, all_data, stock_end_date):
    """
    Build the daily financial feature table for one ticker.

    Scrapes the four statements, converts them to numeric values, normalizes
    their date index, adds an ROE column when its inputs are present, merges
    everything column-wise, expands the result to daily rows up to
    ``stock_end_date`` and inner-joins the ticker's closing prices.

    Args:
        ticker: Ticker symbol; also the key looked up in ``all_data``.
        all_data: Container indexed by ticker whose entries expose a
            ``'Close'`` column.
        stock_end_date: Last day to cover, forwarded to
            ``interpolate_and_forecast``.

    Returns:
        The merged daily DataFrame, or ``None`` when any stage yields no
        usable data or raises.
    """
    try:
        print(f"===== {ticker} ์žฌ๋ฌด๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์‹œ์ž‘ =====")

        # Scrape every statement, in a fixed order
        statement_keys = ('income', 'ratios', 'balance-sheet', 'cash-flow-statement')
        statements = {
            key: scrape_financial_statement(ticker, key) for key in statement_keys
        }

        # Stop when any statement could not be fetched
        if any(frame is None for frame in statements.values()):
            print(f"{ticker}: ์ผ๋ถ€ ์žฌ๋ฌด์ œํ‘œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ€์ ธ์˜ค์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
            return None

        # Numeric conversion first, then date-index normalization
        statements = {
            key: convert_to_numeric(frame) for key, frame in statements.items()
        }
        statements = {
            key: process_index_dates(frame) for key, frame in statements.items()
        }

        # Stop when any statement has no valid date left
        if any(frame is None for frame in statements.values()):
            print(f"{ticker}: ๋‚ ์งœ ๋ณ€ํ™˜ ํ›„ ์œ ํšจํ•œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
            return None

        income_df = statements['income']
        ratio_df = statements['ratios']
        balance_df = statements['balance-sheet']
        cash_df = statements['cash-flow-statement']

        # ROE = net income / shareholders' equity, when both inputs exist
        try:
            has_roe_inputs = (
                'Net Income' in income_df.columns
                and 'Shareholders\' Equity' in balance_df.columns
            )
            if has_roe_inputs:
                ratio_df['ROE'] = (
                    income_df['Net Income'] / balance_df['Shareholders\' Equity']
                )
        except Exception as e:
            print(f"ROE ๊ณ„์‚ฐ ์˜ค๋ฅ˜: {e}")

        # Merge all statements column-wise
        summary_df = pd.concat([income_df, balance_df, ratio_df, cash_df], axis=1)

        # Keep only the first occurrence of each repeated column name
        repeated_columns = summary_df.columns[summary_df.columns.duplicated()].unique()
        if len(repeated_columns) > 0:
            print(f"{ticker} ์ค‘๋ณต ์ปฌ๋Ÿผ ์ œ๊ฑฐ: {repeated_columns}")
            first_occurrence = ~summary_df.columns.duplicated()
            summary_df = summary_df.loc[:, first_occurrence]

        # Daily interpolation and forecast
        daily_summary = interpolate_and_forecast(summary_df, stock_end_date)
        if daily_summary.empty:
            print(f"{ticker}: ์œ ํšจํ•œ ์ผ๋ณ„ ์žฌ๋ฌด ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค")
            return None

        # Closing prices are required for the final merge
        if ticker not in all_data or 'Close' not in all_data[ticker].columns:
            print(f"{ticker}: Close ๋ฐ์ดํ„ฐ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค")
            return None

        close_df = pd.DataFrame(all_data[ticker]['Close'])
        close_df.columns = ['Close']

        # Inner join keeps only days present in both tables
        daily_summary = daily_summary.merge(
            close_df, left_index=True, right_index=True, how='inner'
        )
        if daily_summary.empty:
            print(f"{ticker}: ์ฃผ๊ฐ€ ๋ฐ์ดํ„ฐ์™€ ๋ณ‘ํ•ฉ ํ›„ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค")
            return None

        print(f"{ticker} ์žฌ๋ฌด ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {daily_summary.shape}")
        return daily_summary
    except Exception as e:
        print(f"{ticker} ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        import traceback
        traceback.print_exc()
        return None