# Source: Hugging Face Space "Pushkya/Financial_bot" (status: Running).
# File size: 13,822 bytes.
"""
SEC EDGAR Fetcher for Apple Inc. (AAPL)
========================================
Fetches 10-K, 10-Q, and 8-K filings from SEC EDGAR and stores them locally.

Usage:
    python sec_fetcher.py

Output structure:
    data/raw/sec_filings/
    └── AAPL/
        ├── 10-K/
        │   ├── 2024/
        │   │   ├── metadata.json
        │   │   └── filing.htm
        │   ├── 2023/
        │   └── 2022/
        ├── 10-Q/
        │   ├── 2026_Q2/
        │   └── ...
        └── 8-K/
            └── ...
"""

import json
import time
import logging
import requests
from datetime import datetime, timezone
from pathlib import Path

# ── Configuration ─────────────────────────────────────────────────────────────

# Company being fetched. The CIK is kept in two spellings because the
# submissions URL below uses the zero-padded form while archive paths use
# the unpadded one.
COMPANY_NAME = "Apple Inc."
TICKER = "AAPL"
CIK = "0000320193"
CIK_PLAIN = "320193"

# Form types to fetch, and how many of the most recent filings to take for each.
FILING_TYPES = ["10-K", "10-Q", "8-K"]
FETCH_COUNT = {"10-K": 3, "10-Q": 6, "8-K": 5}

# Output root: <parent of this file's directory>/data/raw/sec_filings/<TICKER>
BASE_DIR = Path(__file__).parent.parent.joinpath("data", "raw", "sec_filings")
OUTPUT_DIR = BASE_DIR.joinpath(TICKER)

# SEC EDGAR endpoints live on two hosts:
#   data.sec.gov -> submissions JSON (filing metadata)
#   www.sec.gov  -> archive file downloads
# Request headers must name whichever host is actually being called.
SUBMISSIONS_URL = "https://data.sec.gov/submissions/CIK" + CIK + ".json"

# SEC policy asks for a descriptive User-Agent on every request.
USER_AGENT = "Morningstar RAG Research Pipeline contact@example.com"

# Pause before each request, in seconds (150 ms keeps the request rate
# under SEC's limit of 10 requests per second).
REQUEST_DELAY = 0.15

# ── Logging ───────────────────────────────────────────────────────────────────

# Log directory sits next to the data directory: one level above this file's
# own directory. It is created at import time if it does not exist yet.
LOG_DIR = Path(__file__).parent.parent.joinpath("logs")
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Records at INFO and above go to both the log file and the console.
logging.basicConfig(
    handlers=[
        logging.FileHandler(LOG_DIR / "sec_fetcher.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)


# ── HTTP helpers ──────────────────────────────────────────────────────────────

def make_headers(host: str) -> dict:
    """
    Assemble the request headers for a call to ``host``.

    The result always carries the module-wide User-Agent, advertises
    gzip/deflate support, and sets ``Host`` to the host name passed in.
    """
    headers = {}
    headers["User-Agent"] = USER_AGENT
    headers["Accept-Encoding"] = "gzip, deflate"
    headers["Host"] = host
    return headers


def get(url: str, stream: bool = False, retries: int = 3) -> requests.Response:
    """
    GET ``url`` with rate limiting and retries.

    Each attempt first sleeps ``REQUEST_DELAY`` seconds, then issues the
    request with headers built for the URL's own host.

    Args:
        url: Absolute URL to fetch.
        stream: Passed through to ``requests.get``; when True the body is not
            read eagerly and the caller is responsible for consuming/closing it.
        retries: Maximum number of attempts.

    Returns:
        The first response whose status passes ``raise_for_status()``.

    Raises:
        RuntimeError: If no attempt succeeded. The last request exception, if
            any, is attached as ``__cause__``.
    """
    from urllib.parse import urlparse
    host = urlparse(url).netloc   # e.g. "data.sec.gov" or "www.sec.gov"

    last_error = None
    for attempt in range(1, retries + 1):
        try:
            time.sleep(REQUEST_DELAY)
            resp = requests.get(
                url,
                headers = make_headers(host),
                timeout = 30,
                stream  = stream,
            )
            if resp.status_code == 429:
                # Retry-After may be delay-seconds or an HTTP-date; only the
                # numeric form is honoured, anything else falls back to 60s.
                try:
                    wait = max(0, int(resp.headers.get("Retry-After", 60)))
                except (TypeError, ValueError):
                    wait = 60
                # Release the connection; with stream=True it would otherwise
                # stay checked out while we sleep.
                resp.close()
                last_error = None
                if attempt < retries:
                    log.warning(f"Rate limited — waiting {wait}s ...")
                    time.sleep(wait)
                else:
                    # No point sleeping when there is no attempt left.
                    log.warning(f"Rate limited on final attempt for {url}")
                continue

            resp.raise_for_status()
            return resp

        except requests.RequestException as e:
            last_error = e
            log.warning(f"Attempt {attempt}/{retries} failed for {url}: {e}")
            if attempt < retries:
                # Linear back-off: 5s, 10s, ...
                time.sleep(5 * attempt)

    raise RuntimeError(f"Failed after {retries} attempts: {url}") from last_error


def save_json(data: dict, path: Path):
    """Write ``data`` to ``path`` as 2-space-indented JSON, creating parent directories as needed."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w") as handle:
        json.dump(data, handle, indent=2)


def save_file(url: str, path: Path) -> bool:
    """
    Stream-download ``url`` to ``path``.

    The body is first written to a sibling ``<name>.part`` file and only moved
    onto ``path`` once the transfer has completed, so a failed or interrupted
    download never leaves a truncated file under the final name (and never
    clobbers a good file that was already there).

    Args:
        url: Location of the file to download.
        path: Destination on disk; parent directories are created as needed.

    Returns:
        True if the file was downloaded and moved into place, False on any
        error (the error is logged, not raised).
    """
    partial = path.with_name(path.name + ".part")
    try:
        resp = get(url, stream=True)
        try:
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        finally:
            # Always hand the streamed connection back, even if writing failed.
            resp.close()
        partial.replace(path)
        size_kb = path.stat().st_size / 1024
        log.info(f"    Saved {path.name}  ({size_kb:.1f} KB)")
        return True
    except Exception as e:
        log.error(f"    Could not save {path.name}: {e}")
        # Best-effort cleanup of the incomplete download; it may not exist.
        try:
            partial.unlink()
        except OSError:
            pass
        return False


def now_utc() -> str:
    """Return the current time as a timezone-aware UTC ISO-8601 string."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()


# ── Filing index lookup ───────────────────────────────────────────────────────

def fetch_filing_doc_list(accession_fmt: str) -> list:
    """
    Fetch the document list for one filing via its EDGAR directory index JSON.

    The filing's archive directory on www.sec.gov exposes an ``index.json``
    listing; each entry under ``directory.item`` becomes one result dict.

    Args:
        accession_fmt: Accession number with dashes removed,
            e.g. 000032019324000123.

    Returns:
        A list of ``{name, type, size, url}`` dicts, where ``url`` is the
        download URL of that document. Returns an empty list if the index
        could not be fetched or parsed (the failure is logged, not raised).
    """
    # Archive paths (including the index listing) are served from www.sec.gov;
    # data.sec.gov only hosts the JSON APIs such as /submissions/.
    filing_dir_url = (
        f"https://www.sec.gov/Archives/edgar/data/{CIK_PLAIN}/{accession_fmt}"
    )
    try:
        resp = get(f"{filing_dir_url}/index.json")
        items = resp.json().get("directory", {}).get("item", [])
        return [
            {
                "name": d["name"],
                "type": d.get("type", ""),
                "size": d.get("size", ""),
                "url" : f"{filing_dir_url}/{d['name']}",
            }
            for d in items
            if isinstance(d, dict) and "name" in d
        ]
    except Exception as e:
        # Logged at WARNING so the failure is visible under the INFO-level
        # logging config; callers treat an empty list as "no listing".
        log.warning(f"    Doc list fetch failed for {accession_fmt}: {e}")
        return []


# ── Main fetcher class ────────────────────────────────────────────────────────

class SECEdgarFetcher:
    """
    Downloads the most recent filings of each configured form type.

    For every form in ``FILING_TYPES`` the fetcher takes the first
    ``FETCH_COUNT[form]`` matching entries from the submissions index and
    stores, per filing, a ``metadata.json`` plus the downloaded document under
    ``<output_dir>/<form>/<folder>/``.
    """

    def __init__(self, output_dir: Path):
        """Remember the output root and create it if it does not exist."""
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run(self):
        """
        Fetch the submissions index, then download every selected filing.

        Writes ``company_metadata.json`` and ``fetch_summary.json`` into the
        output directory. Returns early (after logging an error) if the
        submissions index could not be fetched.
        """
        log.info("=" * 60)
        log.info(f"SEC EDGAR Fetcher  —  {COMPANY_NAME} ({TICKER})")
        log.info(f"CIK       : {CIK}")
        log.info(f"Output    : {self.output_dir}")
        log.info("=" * 60)

        submissions = self._fetch_submissions()
        if not submissions:
            log.error("Could not fetch submissions. Aborting.")
            return

        # Save company-level metadata once
        save_json({
            "name"        : submissions.get("name"),
            "cik"         : CIK,
            "ticker"      : TICKER,
            "sic"         : submissions.get("sic"),
            "sic_desc"    : submissions.get("sicDescription"),
            "state"       : submissions.get("stateOfIncorporation"),
            "fiscal_year" : submissions.get("fiscalYearEnd"),
            "fetched_at"  : now_utc(),
        }, self.output_dir / "company_metadata.json")

        log.info(f"Company   : {submissions.get('name')}")
        log.info(f"Industry  : {submissions.get('sicDescription')}")

        filings = self._parse_filings(submissions)
        summary = {}

        for ftype in FILING_TYPES:
            n       = FETCH_COUNT.get(ftype, 3)
            # Keep index order and take the first n entries of this form type.
            subset  = [f for f in filings if f["form"] == ftype][:n]
            log.info(f"\n--- {ftype}  ({len(subset)} filings) ---")
            saved   = self._process_filing_type(ftype, subset)
            summary[ftype] = {"requested": n, "saved": saved}

        save_json({"run_at": now_utc(), "summary": summary},
                  self.output_dir / "fetch_summary.json")

        log.info("\n" + "=" * 60)
        log.info("Done.")
        for ftype, s in summary.items():
            log.info(f"  {ftype:5s}  {s['saved']}/{s['requested']} saved")
        log.info("=" * 60)

    # ── Submissions ───────────────────────────────────────────────────────────

    def _fetch_submissions(self) -> dict:
        """Return the parsed submissions JSON, or an empty dict on any failure."""
        log.info("Fetching submissions index ...")
        try:
            return get(SUBMISSIONS_URL).json()
        except Exception as e:
            log.error(f"Submissions fetch failed: {e}")
            return {}

    def _parse_filings(self, submissions: dict) -> list:
        """
        Flatten ``filings.recent`` into a list of per-filing dicts.

        The index stores one parallel list per field; they are zipped
        together, so the result is as long as the shortest of them.
        Each dict has ``form``, ``date``, ``accession`` (with dashes),
        ``acc_fmt`` (dashes removed) and ``primary_doc``.
        """
        recent  = submissions.get("filings", {}).get("recent", {})
        forms   = recent.get("form",            [])
        dates   = recent.get("filingDate",       [])
        accnums = recent.get("accessionNumber",  [])
        docs    = recent.get("primaryDocument",  [])

        filings = [
            {
                "form"       : form,
                "date"       : date,
                "accession"  : acc,                          # with dashes
                "acc_fmt"    : acc.replace("-", ""),         # without dashes
                "primary_doc": doc,
            }
            for form, date, acc, doc in zip(forms, dates, accnums, docs)
        ]
        log.info(f"Total filings in index: {len(filings)}")
        return filings

    # ── Per-filing processing ─────────────────────────────────────────────────

    def _process_filing_type(self, ftype: str, filings: list) -> int:
        """Process each filing in turn and return how many succeeded (or were cached)."""
        return sum(1 for filing in filings if self._process_one(ftype, filing))

    def _process_one(self, ftype: str, filing: dict) -> bool:
        """
        Download one filing's document and write its metadata.

        Skips the download when the filing folder already holds
        ``metadata.json`` and at least one other file larger than 1 KB.
        Metadata is written before the download so provenance survives a
        failed download; on success it is rewritten with ``local_path`` set to
        the file that was actually saved.

        Returns:
            True if the filing is cached or was downloaded, False otherwise.
        """
        date     = filing["date"]
        acc      = filing["accession"]
        acc_fmt  = filing["acc_fmt"]
        prim_doc = filing["primary_doc"]

        folder      = self._folder_name(ftype, date, acc)
        filing_dir  = self.output_dir / ftype / folder
        meta_path   = filing_dir / "metadata.json"

        # Already downloaded → skip only if a filing file also exists
        if meta_path.exists():
            has_filing = any(
                f.name != "metadata.json" and f.stat().st_size > 1024
                for f in filing_dir.iterdir()
            )
            if has_filing:
                log.info(f"  SKIP  {ftype}/{folder}  (already cached)")
                return True
            log.info(f"  Re-fetching  {ftype}/{folder}  (metadata exists but no filing file)")

        log.info(f"  Fetching  {ftype}/{folder}  ({date}) ...")

        # Fetch document list from the filing index
        doc_list = fetch_filing_doc_list(acc_fmt)

        # Build the primary document download URL
        # Archive base: https://www.sec.gov/Archives/edgar/data/<CIK>/<acc_fmt>/
        archive_base = (
            f"https://www.sec.gov/Archives/edgar/data/{CIK_PLAIN}/{acc_fmt}"
        )
        doc_url = f"{archive_base}/{prim_doc}"

        # Save metadata first (even if download fails, we have provenance)
        metadata = {
            "ticker"      : TICKER,
            "form"        : ftype,
            "filing_date" : date,
            "fiscal_year" : date[:4],
            "accession"   : acc,
            "primary_doc" : prim_doc,
            "doc_url"     : doc_url,
            "archive_base": archive_base,
            "all_docs"    : doc_list,
            "source"      : "SEC EDGAR",
            "license"     : "public",
            "fetched_at"  : now_utc(),
        }
        save_json(metadata, meta_path)

        # Download primary document. With an empty primary_doc the URL would be
        # the bare archive directory, so skip straight to the alternatives
        # rather than saving a directory page as the filing.
        ext        = Path(prim_doc).suffix.lower() or ".htm"
        doc_path   = filing_dir / f"filing{ext}"
        saved_path = None
        if prim_doc and save_file(doc_url, doc_path):
            saved_path = doc_path

        if saved_path is None:
            # Try alternative: an .htm file in the doc list whose name contains
            # the form type without its dash (e.g. "10k").
            wanted = ftype.replace("-", "").lower()
            for d in doc_list:
                name = d["name"]
                if name.endswith(".htm") and wanted in name.lower():
                    log.info(f"    Trying alternative: {name}")
                    alt_path = filing_dir / name
                    if save_file(d["url"], alt_path):
                        saved_path = alt_path
                        break

        if saved_path is not None:
            # Record the file that was really written (primary or alternative).
            metadata["local_path"] = str(saved_path)
            save_json(metadata, meta_path)
            return True

        log.warning(f"  FAILED  {ftype}/{folder} — metadata saved, file not downloaded")
        return False

    def _folder_name(self, ftype: str, date: str, acc: str) -> str:
        """
        Derive the per-filing folder name from the form type and filing date.

        ``date`` is expected as ``YYYY-MM-DD``. 10-K → ``YYYY``;
        10-Q → ``YYYY_Q<n>`` where n is the calendar quarter of the filing
        date's month; anything else → the full date. ``acc`` is accepted but
        not used.

        NOTE(review): two filings of the same type that map to the same name
        (e.g. two 8-Ks filed on one date) share a folder, so the second one is
        treated as already cached — confirm whether that is intended.
        """
        year = date[:4]
        if ftype == "10-K":
            return year
        if ftype == "10-Q":
            month   = int(date[5:7])
            quarter = (month - 1) // 3 + 1
            return f"{year}_Q{quarter}"
        return date   # 8-K: full date


# ── Entry point ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Run as a script: fetch everything into the ticker's output directory.
    SECEdgarFetcher(output_dir=OUTPUT_DIR).run()