# NOTE: The lines "Spaces:" / "Runtime error" that preceded this script were
# page-header artifacts from the hosting UI capture, not part of the program.
| import argparse | |
| import zipfile | |
| from pathlib import Path | |
| from urllib.parse import urlparse | |
| import requests | |
| import pandas as pd | |
# ------------------------------------------------------------
# 1) Download the ZIP (with local caching)
# ------------------------------------------------------------
def download_zip(url: str, cache_dir: str) -> Path:
    """Download *url* into *cache_dir* and return the local file path.

    If a file with the same basename already exists in the cache directory,
    it is reused and no network request is made.

    Args:
        url: Full URL of the SEC 13F data-set ZIP archive.
        cache_dir: Directory used as a download cache (created if missing).

    Returns:
        Path to the cached or freshly downloaded ZIP file.

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    cache_path = Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)
    zip_path = cache_path / Path(urlparse(url).path).name
    if zip_path.exists():
        print(f"[INFO] Using cached ZIP โ {zip_path}")
        return zip_path
    print(f"[INFO] Downloading ZIP from {url}")
    # SEC requires a descriptive User-Agent; the explicit Host header is kept
    # from the original code (requests would set it automatically).
    headers = {"User-Agent": "2Digit AI@2digit.io", "Host": "www.sec.gov"}
    # Fix: the original requested stream=True but then read resp.content,
    # which buffers the entire archive in memory anyway and never closes the
    # response. Stream to disk in chunks instead, and add a timeout so a
    # stalled connection cannot hang forever.
    with requests.get(url, headers=headers, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with zip_path.open("wb") as out:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                if chunk:  # skip keep-alive chunks
                    out.write(chunk)
    print(f"[INFO] Saved ZIP โ {zip_path}")
    return zip_path
# ------------------------------------------------------------
# 2) Read the TSV tables out of the ZIP
# ------------------------------------------------------------
def load_tsv_from_zip(zip_path: Path):
    """Load the SUBMISSION, COVERPAGE and INFOTABLE tables from a 13F ZIP.

    Only the columns needed downstream are read, and everything is loaded as
    ``str`` so pandas never mis-infers dtypes on the large INFOTABLE file.

    Args:
        zip_path: Path to the SEC 13F data-set ZIP archive.

    Returns:
        Tuple of ``(submission, cover, info)`` DataFrames.

    Raises:
        StopIteration: If one of the expected TSV members is missing.
    """
    print("[INFO] Loading raw 13F data from ZIP...")
    with zipfile.ZipFile(zip_path, "r") as z:
        names = z.namelist()
        cover_path = next(f for f in names if f.endswith("/COVERPAGE.tsv"))
        info_path = next(f for f in names if f.endswith("/INFOTABLE.tsv"))
        submission_path = next(f for f in names if f.endswith("/SUBMISSION.tsv"))
        print(f"[INFO] Loading SUBMISSION โ {submission_path.split('/')[-1]}")
        print(f"[INFO] Loading COVERPAGE โ {cover_path.split('/')[-1]}")
        print(f"[INFO] Loading INFOTABLE โ {info_path.split('/')[-1]}")
        # Fix: the original passed z.open(...) directly to read_csv and the
        # ZipExtFile handles were never closed; wrap each in a context
        # manager so they are released deterministically.
        with z.open(submission_path) as fh:
            submission = pd.read_csv(
                fh, sep="\t",
                usecols=["ACCESSION_NUMBER", "SUBMISSIONTYPE"],
                dtype=str,
            )
        with z.open(cover_path) as fh:
            cover = pd.read_csv(
                fh, sep="\t",
                usecols=["ACCESSION_NUMBER", "FILINGMANAGER_NAME"],
                dtype=str,
            )
        info_cols = ["ACCESSION_NUMBER", "NAMEOFISSUER", "CUSIP", "FIGI", "VALUE", "SSHPRNAMT"]
        with z.open(info_path) as fh:
            info = pd.read_csv(
                fh, sep="\t",
                usecols=info_cols,
                dtype=str,
                low_memory=False,
            )
    print(f"[INFO] Raw load complete:")
    print(f" โ SUBMISSION: {len(submission):,} rows")
    print(f" โ COVERPAGE : {len(cover):,} rows")
    print(f" โ INFOTABLE : {len(info):,} rows")
    return submission, cover, info
# ------------------------------------------------------------
# 3) FIGI-based issuer-name normalization (memory-conscious)
# ------------------------------------------------------------
def normalize_issuer(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with NAMEOFISSUER upper-cased/stripped and,
    for rows carrying a FIGI, replaced by one canonical name per FIGI
    (the first occurrence of each FIGI wins)."""
    print("[INFO] Normalizing issuer names using FIGI...")
    out = df.copy()
    out["NAMEOFISSUER"] = out["NAMEOFISSUER"].str.upper().str.strip()
    # Group via FIGI only when at least one row has it — avoids building
    # the mapping (and its memory cost) on FIGI-less data.
    has_figi = out["FIGI"].notna()
    if not has_figi.any():
        return out
    # Canonical name per FIGI: first row seen for each identifier.
    canonical = (
        out.loc[has_figi, ["FIGI", "NAMEOFISSUER"]]
        .drop_duplicates(subset="FIGI")
        .set_index("FIGI")["NAMEOFISSUER"]
    )
    mapped = out["FIGI"].map(canonical)
    out["NAMEOFISSUER"] = mapped.fillna(out["NAMEOFISSUER"])
    return out
# ------------------------------------------------------------
# 4) Core processing: keep only positions >= 1% of each investor's total
# ------------------------------------------------------------
def build_tables(submission: pd.DataFrame, cover: pd.DataFrame, info: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Filter raw 13F tables down to each investor's meaningful holdings.

    Pipeline: keep only 13F-HR / 13F-HR/A filings, merge holdings with the
    filing-manager names, convert values to numbers, drop non-positive
    values, keep positions worth at least 1% of the investor's total
    reported value, then shape the two output tables.

    Args:
        submission: SUBMISSION table (ACCESSION_NUMBER, SUBMISSIONTYPE), str dtype.
        cover: COVERPAGE table (ACCESSION_NUMBER, FILINGMANAGER_NAME), str dtype.
        info: INFOTABLE holdings table, str dtype.

    Returns:
        ``(final, investors)`` — *final* has columns ID/NAME/COMPANY/PERCENT/
        VALUE/AMOUNT, sorted by investor total (descending) then position
        value; *investors* is a one-column NAME table in the same investor
        order. Both are empty DataFrames when no rows survive filtering.
    """
    print("[INFO] Starting post-processing and filtering...")
    # 1. Keep only 13F-HR / 13F-HR/A accessions (drops 13F-NT notice filings).
    valid_accessions = submission[
        submission["SUBMISSIONTYPE"].isin({"13F-HR", "13F-HR/A"})
    ]["ACCESSION_NUMBER"]
    before_cover = len(cover)
    cover = cover[cover["ACCESSION_NUMBER"].isin(valid_accessions)].copy()
    info = info[info["ACCESSION_NUMBER"].isin(valid_accessions)].copy()
    print(f"[INFO] Filtered out 13F-NT โ COVERPAGE {before_cover:,} โ {len(cover):,}")
    print(f"[INFO] โ INFOTABLE โ {len(info):,}")
    if info.empty:
        print("[WARN] No valid holdings after removing 13F-NT")
        return pd.DataFrame(), pd.DataFrame()
    # 2. Normalize issuer names, then attach the manager name (inner merge:
    #    holdings whose accession has no cover page are dropped).
    data = (
        info.pipe(normalize_issuer)
        .merge(cover[["ACCESSION_NUMBER", "FILINGMANAGER_NAME"]], on="ACCESSION_NUMBER")
    )
    del submission, cover, info  # release the large inputs immediately
    data["FILINGMANAGER_NAME"] = data["FILINGMANAGER_NAME"].str.upper().str.strip()
    # 3. Numeric conversion; unparseable strings become NaN.
    data["VALUE"] = pd.to_numeric(data["VALUE"], errors="coerce")
    data["AMOUNT"] = pd.to_numeric(data["SSHPRNAMT"], errors="coerce")
    # Drop rows whose VALUE is 0 or NaN entirely (removes zero-total investors;
    # the `> 0` comparison is False for NaN, so both cases go together).
    before = len(data)
    data = data[data["VALUE"] > 0].copy()
    print(f"[INFO] Removed VALUE โค 0 rows โ {before:,} โ {len(data):,}")
    if data.empty:
        print("[WARN] No positive VALUE holdings after cleaning")
        return pd.DataFrame(), pd.DataFrame()
    # Safe to cast now: VALUE has no NaN left; AMOUNT NaNs become 0.
    data["VALUE"] = data["VALUE"].astype("int64")
    data["AMOUNT"] = data["AMOUNT"].fillna(0).astype("int64")
    # 4. Total reported value per investor, broadcast back onto every row.
    data["INVESTOR_TOTAL"] = data.groupby("FILINGMANAGER_NAME", observed=True)["VALUE"].transform("sum")
    # 5. Keep only positions worth at least 1% of the investor's total.
    data = data[data["VALUE"] >= data["INVESTOR_TOTAL"] * 0.01].copy()
    if data.empty:
        print("[WARN] No positions โฅ1% after all filtering")
        return pd.DataFrame(), pd.DataFrame()
    print(f"[INFO] Final meaningful holdings (โฅ1% per investor): {len(data):,}")
    print(f"[INFO] Active investors with significant positions: {data['FILINGMANAGER_NAME'].nunique():,}")
    # 6. Position weight as a percentage of the investor's total.
    data["PERCENT"] = (data["VALUE"] / data["INVESTOR_TOTAL"] * 100).round(3)
    # 7. Final column selection and renaming for the output table.
    final = data.rename(columns={
        "CUSIP": "ID",
        "FILINGMANAGER_NAME": "NAME",
        "NAMEOFISSUER": "COMPANY",
    })[["ID", "NAME", "COMPANY", "PERCENT", "VALUE", "AMOUNT"]]
    # 8. Investor list ordered by total value, largest first; the same order
    #    drives the row sort of *final* via an ordered Categorical.
    investor_order = (
        data.groupby("FILINGMANAGER_NAME")["INVESTOR_TOTAL"]
        .first()
        .sort_values(ascending=False)
        .index
    )
    final["NAME"] = pd.Categorical(final["NAME"], categories=investor_order, ordered=True)
    final = final.sort_values(["NAME", "VALUE"], ascending=[True, False]).reset_index(drop=True)
    investors = pd.DataFrame({"NAME": investor_order})
    print(f"[INFO] Done! Output ready โ {len(final):,} rows from {len(investors)} real investors")
    return final, investors
# ------------------------------------------------------------
# 5) MAIN
# ------------------------------------------------------------
def main():
    """CLI entry point: fetch the SEC 13F ZIP, build the tables, write CSVs."""
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg_parser.add_argument("--input", type=str, default="src_data/", help="์์ฒ ๋ฐ์ดํฐ ํด๋")
    arg_parser.add_argument(
        "--url",
        type=str,
        default="https://www.sec.gov/files/structureddata/data/form-13f-data-sets/01jun2025-31aug2025_form13f.zip",
        help="sec 13f zip ํ์ผ URL",
    )
    arg_parser.add_argument("--output", type=str, default="data/", help="์ ๋ฆฌ๋ ๋ฐ์ดํฐ ์ ์ฅ ๊ฒฝ๋ก")
    cli = arg_parser.parse_args()

    archive = download_zip(cli.url, cli.input)
    sub_df, cover_df, info_df = load_tsv_from_zip(archive)  # three tables back
    holdings, investor_names = build_tables(sub_df, cover_df, info_df)

    out_dir = Path(cli.output)
    out_dir.mkdir(parents=True, exist_ok=True)
    holdings.to_csv(out_dir / "SEC_Filing_Manager.csv", index=False)
    investor_names.to_csv(out_dir / "INVEST_NAME.csv", index=False)
    print("[INFO] All done โ clean SEC 13F data ready.")


if __name__ == "__main__":
    main()