# portfolio/sec13f_processor.py
# (Uploaded via huggingface_hub, commit bf3714e)
import argparse
import zipfile
from pathlib import Path
from urllib.parse import urlparse
import requests
import pandas as pd
# ------------------------------------------------------------
# 1) ZIP ๋‹ค์šด๋กœ๋“œ (์บ์‹ฑ)
# ------------------------------------------------------------
def download_zip(url: str, cache_dir: str) -> Path:
    """Download the SEC 13F ZIP at *url* into *cache_dir*, reusing any cached copy.

    Args:
        url: Direct URL to the SEC structured-data ZIP archive.
        cache_dir: Local cache directory (created if missing).

    Returns:
        Path to the local ZIP file (cached or freshly downloaded).

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    cache_dir = Path(cache_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    # Cache key is the file name taken from the URL path.
    zip_path = cache_dir / Path(urlparse(url).path).name
    if zip_path.exists():
        print(f"[INFO] Using cached ZIP โ†’ {zip_path}")
        return zip_path
    print(f"[INFO] Downloading ZIP from {url}")
    # SEC requires a descriptive User-Agent for automated downloads.
    headers = {"User-Agent": "2Digit AI@2digit.io", "Host": "www.sec.gov"}
    resp = requests.get(url, headers=headers, stream=True, timeout=60)
    resp.raise_for_status()
    # BUG FIX: the original read resp.content, which buffers the entire
    # archive in memory and defeats stream=True. Stream to disk in chunks.
    with open(zip_path, "wb") as fh:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            if chunk:
                fh.write(chunk)
    print(f"[INFO] Saved ZIP โ†’ {zip_path}")
    return zip_path
# ------------------------------------------------------------
# 2) ZIP ๋‚ด๋ถ€ ์ฝ๊ธฐ
# ------------------------------------------------------------
def load_tsv_from_zip(zip_path: Path):
    """Load the SUBMISSION, COVERPAGE and INFOTABLE TSVs from a SEC 13F ZIP.

    Only the columns needed downstream are read, all as strings.

    Args:
        zip_path: Path to the downloaded SEC 13F structured-data ZIP.

    Returns:
        Tuple ``(submission, cover, info)`` of DataFrames with str dtype.

    Raises:
        ValueError: If one of the expected TSV members is missing.
    """
    print("[INFO] Loading raw 13F data from ZIP...")

    def _find_member(names, filename):
        # BUG FIX: the original used next(...) without a default, which
        # raises an opaque StopIteration when a member is absent. Match by
        # basename (covers nested and top-level members) and fail clearly.
        member = next((n for n in names if n.split("/")[-1] == filename), None)
        if member is None:
            raise ValueError(f"Missing {filename} in {zip_path}")
        return member

    with zipfile.ZipFile(zip_path, "r") as z:
        names = z.namelist()
        cover_path = _find_member(names, "COVERPAGE.tsv")
        info_path = _find_member(names, "INFOTABLE.tsv")
        submission_path = _find_member(names, "SUBMISSION.tsv")
        print(f"[INFO] Loading SUBMISSION โ† {submission_path.split('/')[-1]}")
        print(f"[INFO] Loading COVERPAGE โ† {cover_path.split('/')[-1]}")
        print(f"[INFO] Loading INFOTABLE โ† {info_path.split('/')[-1]}")
        # Read only the minimal column set; context managers make sure the
        # member handles are closed (the original left them open).
        with z.open(submission_path) as fh:
            submission = pd.read_csv(
                fh, sep="\t",
                usecols=["ACCESSION_NUMBER", "SUBMISSIONTYPE"],
                dtype=str
            )
        with z.open(cover_path) as fh:
            cover = pd.read_csv(
                fh, sep="\t",
                usecols=["ACCESSION_NUMBER", "FILINGMANAGER_NAME"],
                dtype=str
            )
        info_cols = ["ACCESSION_NUMBER", "NAMEOFISSUER", "CUSIP", "FIGI", "VALUE", "SSHPRNAMT"]
        with z.open(info_path) as fh:
            info = pd.read_csv(
                fh, sep="\t",
                usecols=info_cols,
                dtype=str,
                low_memory=False
            )
    print(f"[INFO] Raw load complete:")
    print(f" โ†’ SUBMISSION: {len(submission):,} rows")
    print(f" โ†’ COVERPAGE : {len(cover):,} rows")
    print(f" โ†’ INFOTABLE : {len(info):,} rows")
    return submission, cover, info
# ------------------------------------------------------------
# 3) FIGI ๊ธฐ๋ฐ˜ Issuer ์ด๋ฆ„ ์ •๊ทœํ™” (๋ฉ”๋ชจ๋ฆฌ ์ตœ์†Œํ™”)
# ------------------------------------------------------------
def normalize_issuer(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with issuer names normalized.

    NAMEOFISSUER is upper-cased and stripped; where a row carries a FIGI,
    the first name seen for that FIGI becomes the canonical name for every
    row sharing it. The input frame is not mutated.
    """
    print("[INFO] Normalizing issuer names using FIGI...")
    out = df.copy()
    out["NAMEOFISSUER"] = out["NAMEOFISSUER"].str.upper().str.strip()
    has_figi = out["FIGI"].notna()
    if not has_figi.any():
        # Nothing to unify — avoid building an empty mapping.
        return out
    # First occurrence per FIGI wins as the representative issuer name.
    canonical = (
        out.loc[has_figi, ['FIGI', 'NAMEOFISSUER']]
        .drop_duplicates(subset='FIGI')
        .set_index('FIGI')['NAMEOFISSUER']
    )
    out["NAMEOFISSUER"] = out["FIGI"].map(canonical).fillna(out["NAMEOFISSUER"])
    return out
# ------------------------------------------------------------
# 4) ํ•ต์‹ฌ ์ฒ˜๋ฆฌ: ๊ฐ ํˆฌ์ž์ž๋ณ„ 1% ์ด์ƒ ํฌ์ง€์…˜๋งŒ ๋‚จ๊ธฐ๊ธฐ
# ------------------------------------------------------------
def build_tables(submission: pd.DataFrame, cover: pd.DataFrame, info: pd.DataFrame):
    """Reduce raw 13F tables to each investor's >=1%-of-AUM positions.

    Args:
        submission: SUBMISSION rows (ACCESSION_NUMBER, SUBMISSIONTYPE), str dtype.
        cover: COVERPAGE rows (ACCESSION_NUMBER, FILINGMANAGER_NAME), str dtype.
        info: INFOTABLE holding rows, str dtype.

    Returns:
        Tuple ``(final, investors)``: ``final`` has columns
        ID/NAME/COMPANY/PERCENT/VALUE/AMOUNT sorted by investor AUM then
        position size; ``investors`` lists manager names ordered by AUM
        descending. Both are empty DataFrames when no rows survive filtering.
    """
    print("[INFO] Starting post-processing and filtering...")
    # 1. Keep only 13F-HR / 13F-HR/A filings (drop 13F-NT notice filings).
    valid_accessions = submission[
        submission["SUBMISSIONTYPE"].isin({"13F-HR", "13F-HR/A"})
    ]["ACCESSION_NUMBER"]
    before_cover = len(cover)
    cover = cover[cover["ACCESSION_NUMBER"].isin(valid_accessions)].copy()
    info = info[info["ACCESSION_NUMBER"].isin(valid_accessions)].copy()
    print(f"[INFO] Filtered out 13F-NT โ†’ COVERPAGE {before_cover:,} โ†’ {len(cover):,}")
    print(f"[INFO] โ†’ INFOTABLE โ†’ {len(info):,}")
    if info.empty:
        print("[WARN] No valid holdings after removing 13F-NT")
        return pd.DataFrame(), pd.DataFrame()
    # 2. Normalize issuer names, then attach the filing manager name
    #    (inner merge: holdings without a matching cover row are dropped).
    data = (
        info.pipe(normalize_issuer)
        .merge(cover[["ACCESSION_NUMBER", "FILINGMANAGER_NAME"]], on="ACCESSION_NUMBER")
    )
    del submission, cover, info  # release the large frames immediately
    data["FILINGMANAGER_NAME"] = data["FILINGMANAGER_NAME"].str.upper().str.strip()
    # 3. Numeric conversion (unparseable values become NaN).
    data["VALUE"] = pd.to_numeric(data["VALUE"], errors="coerce")
    data["AMOUNT"] = pd.to_numeric(data["SSHPRNAMT"], errors="coerce")
    # Key fix: drop rows whose VALUE is 0 or NaN (removes AUM=0 investors).
    before = len(data)
    data = data[data["VALUE"] > 0].copy()
    print(f"[INFO] Removed VALUE โ‰ค 0 rows โ†’ {before:,} โ†’ {len(data):,}")
    if data.empty:
        print("[WARN] No positive VALUE holdings after cleaning")
        return pd.DataFrame(), pd.DataFrame()
    # Safe to cast now: VALUE is strictly positive and non-null.
    data["VALUE"] = data["VALUE"].astype("int64")
    data["AMOUNT"] = data["AMOUNT"].fillna(0).astype("int64")
    # 4. Total reported portfolio value (AUM proxy) per investor.
    data["INVESTOR_TOTAL"] = data.groupby("FILINGMANAGER_NAME", observed=True)["VALUE"].transform("sum")
    # 5. Keep only positions worth at least 1% of the investor's total.
    data = data[data["VALUE"] >= data["INVESTOR_TOTAL"] * 0.01].copy()
    if data.empty:
        print("[WARN] No positions โ‰ฅ1% after all filtering")
        return pd.DataFrame(), pd.DataFrame()
    print(f"[INFO] Final meaningful holdings (โ‰ฅ1% per investor): {len(data):,}")
    print(f"[INFO] Active investors with significant positions: {data['FILINGMANAGER_NAME'].nunique():,}")
    # 6. Position weight as a percentage of the investor's total.
    data["PERCENT"] = (data["VALUE"] / data["INVESTOR_TOTAL"] * 100).round(3)
    # 7. Final column selection and renaming.
    final = data.rename(columns={
        "CUSIP": "ID",
        "FILINGMANAGER_NAME": "NAME",
        "NAMEOFISSUER": "COMPANY",
    })[["ID", "NAME", "COMPANY", "PERCENT", "VALUE", "AMOUNT"]]
    # 8. Investor ordering by AUM, largest first (INVESTOR_TOTAL is constant
    #    within each group, so .first() picks the group total).
    investor_order = (
        data.groupby("FILINGMANAGER_NAME")["INVESTOR_TOTAL"]
        .first()
        .sort_values(ascending=False)
        .index
    )
    # Categorical NAME makes the sort follow AUM order instead of alphabetical.
    final["NAME"] = pd.Categorical(final["NAME"], categories=investor_order, ordered=True)
    final = final.sort_values(["NAME", "VALUE"], ascending=[True, False]).reset_index(drop=True)
    investors = pd.DataFrame({"NAME": investor_order})
    print(f"[INFO] Done! Output ready โ†’ {len(final):,} rows from {len(investors)} real investors")
    return final, investors
# ------------------------------------------------------------
# 5) MAIN
# ------------------------------------------------------------
def main():
    """CLI entry point: download the SEC ZIP, build tables, write CSVs."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input", type=str, default="src_data/", help="์›์ฒœ ๋ฐ์ดํ„ฐ ํด๋”")
    parser.add_argument(
        "--url",
        type=str,
        default="https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01jun2025-31aug2025_form13f.zip",
        help="sec 13f zip ํŒŒ์ผ URL",
    )
    parser.add_argument("--output", type=str, default="data/", help="์ •๋ฆฌ๋œ ๋ฐ์ดํ„ฐ ์ €์žฅ ๊ฒฝ๋กœ")
    args = parser.parse_args()

    # Fetch (or reuse) the archive and load its three raw tables.
    archive = download_zip(args.url, args.input)
    subs, covers, holdings_raw = load_tsv_from_zip(archive)

    # Filter down to meaningful (>=1% of AUM) positions per investor.
    holdings, investors = build_tables(subs, covers, holdings_raw)

    out_dir = Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)
    holdings.to_csv(out_dir / "SEC_Filing_Manager.csv", index=False)
    investors.to_csv(out_dir / "INVEST_NAME.csv", index=False)
    print("[INFO] All done โ€“ clean SEC 13F data ready.")
if __name__ == "__main__":
    main()