portfolio / etf /make_data_ETF_csv.py
eric2digit's picture
Upload folder using huggingface_hub
bf3714e verified
import pykrx
import pandas as pd
import yfinance as yf
from datetime import datetime
import re
import os
# ============================================================
# 1) KOSPI / KOSDAQ ํ‹ฐ์ปค ๋ชฉ๋ก ๋ฏธ๋ฆฌ ๋กœ๋”ฉ
# ============================================================
print("[INFO] Loading KOSPI/KOSDAQ ticker lists...")
KOSPI_SET = set(pykrx.stock.get_market_ticker_list(market="KOSPI"))
KOSDAQ_SET = set(pykrx.stock.get_market_ticker_list(market="KOSDAQ"))
# ============================================================
# 2) ์‹œ์žฅ์— ๋”ฐ๋ผ .KS / .KQ ์ž๋™ ๋ถ€์ฐฉ
# ============================================================
def attach_market_suffix(ticker6: str):
if ticker6 in KOSPI_SET:
return ticker6 + ".KS"
if ticker6 in KOSDAQ_SET:
return ticker6 + ".KQ"
return ticker6 + ".KS"
# ============================================================
# 3) RAW-ID ๋ฌธ์ž์—ด์—์„œ ๋Œ€ํ‘œ ํ•œ๊ตญ ํ‹ฐ์ปค ์ถ”์ถœ
# ============================================================
def extract_primary_ticker(raw_id: str):
if pd.isna(raw_id):
return ""
s = str(raw_id).strip()
if re.fullmatch(r"\d{6}\.(KS|KQ)", s):
return s
parts = re.split(r"[,\s]+", s)
# 6์ž๋ฆฌ ์ˆซ์ž โ†’ ์‹œ์žฅ ์ž๋™ํŒ๋ณ„
for p in parts:
if re.fullmatch(r"\d{6}", p):
return attach_market_suffix(p)
# ์ด๋ฏธ .KS/.KQ
for p in parts:
if re.fullmatch(r"\d{6}\.(KS|KQ)", p):
return p
# fallback ์ˆซ์ž
for p in parts:
if p.isdigit() and len(p) == 6:
return attach_market_suffix(p)
# ๋งˆ์ง€๋ง‰ fallback
return parts[0] if parts else s
# ============================================================
# 4) ํšŒ์‚ฌ๋ช… ์กฐํšŒ ํ•จ์ˆ˜
# ============================================================
def safe_company_name(ticker):
try:
yf_t = yf.Ticker(ticker)
try:
fi = yf_t.fast_info
if fi:
nm = fi.get("longName") or fi.get("shortName")
if nm:
return nm
except:
pass
try:
info = yf_t.info
if info:
nm = info.get("longName") or info.get("shortName")
if nm:
return nm
except:
pass
except Exception:
pass
return ""
# ============================================================
# 5) MAIN CODE
# ============================================================
def main():
input_df = pd.read_csv('etf_2.tsv', sep='\t')
output_path = "ETF.csv"
# ํŒŒ์ผ ์—†์œผ๋ฉด ํ—ค๋” ํฌํ•จ ์ƒˆ๋กœ ์ƒ์„ฑ
if not os.path.exists(output_path):
pd.DataFrame(columns=["ID","NAME","COMPANY","VALUE","AMOUNT","PERCENTAGE"])\
.to_csv(output_path, index=False, encoding="utf-8")
for row in input_df.to_dict('records'):
etf_ticker = row['์ข…๋ชฉ์ฝ”๋“œ']
name_from_input = row['์ข…๋ชฉ๋ช…']
today_dt = datetime.now().strftime('%Y%m%d')
df = pykrx.stock.get_etf_portfolio_deposit_file(etf_ticker, today_dt)
if df is None or df.empty:
print(f"[WARN] {etf_ticker} PDF ์—†์Œ. skip")
continue
# ============ 1) ํ‹ฐ์ปค ์ •๋ฆฌ ============
if "ํ‹ฐ์ปค" in df.columns:
df = df.rename(columns={"ํ‹ฐ์ปค": "ID"})
elif df.index.name == "ํ‹ฐ์ปค":
df = df.reset_index().rename(columns={"ํ‹ฐ์ปค": "ID"})
else:
print(f"[ERROR] {etf_ticker}: ํ‹ฐ์ปค ์ปฌ๋Ÿผ ์—†์Œ")
print(df)
continue
raw_ids = df["ID"].astype(str)
df["ID"] = raw_ids.apply(extract_primary_ticker)
# ============ 2) ETF ์ด๋ฆ„ ============
df["NAME"] = name_from_input
# ============ 3) ์ˆซ์ž ์ฒ˜๋ฆฌ ============
if "๊ธˆ์•ก" in df.columns:
df["VALUE"] = pd.to_numeric(df["๊ธˆ์•ก"], errors="coerce")
if "๊ณ„์•ฝ์ˆ˜" in df.columns:
df["AMOUNT"] = pd.to_numeric(df["๊ณ„์•ฝ์ˆ˜"], errors="coerce")
if "VALUE" not in df.columns:
df["VALUE"] = None
if "AMOUNT" not in df.columns:
df["AMOUNT"] = None
df["VALUE"] = df["VALUE"].clip(lower=0)
df["AMOUNT"] = df["AMOUNT"].clip(lower=0)
# ============ 4) ํšŒ์‚ฌ๋ช… ์กฐํšŒ ============
df["COMPANY"] = df["ID"].apply(safe_company_name)
# ============ 5) ๋น„์ค‘ ์ œ๊ฑฐ ============
if "๋น„์ค‘" in df.columns:
df = df.drop(columns=["๋น„์ค‘"])
# ============ 6) ๋น„์œจ ๊ณ„์‚ฐ ============
if df["VALUE"].notna().any():
total_value = df["VALUE"].sum(skipna=True)
if total_value > 0:
df["PERCENTAGE"] = (df["VALUE"] / total_value * 100).round(1)
else:
df["PERCENTAGE"] = 0.0
else:
df["PERCENTAGE"] = None
# ============ 7) ์ตœ์ข… ์ปฌ๋Ÿผ ============
final_cols = ["ID", "NAME", "COMPANY", "VALUE", "AMOUNT", "PERCENTAGE"]
for col in final_cols:
if col not in df.columns:
df[col] = None
df = df[final_cols].reset_index(drop=True)
print(df)
# ===============================================
# 8) CSV ์— append (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„ + UTF-8)
# ===============================================
df.to_csv(
output_path,
mode="a",
header=False,
index=False,
encoding="utf-8"
)
# ์—ฌ๋Ÿฌ ์ข…๋ชฉ ์ฒ˜๋ฆฌํ•  ๋•Œ break ์ œ๊ฑฐ
# break
if __name__ == "__main__":
main()