File size: 13,822 Bytes
8299003
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
"""
SEC EDGAR Fetcher for Apple Inc. (AAPL)
========================================
Fetches 10-K, 10-Q, and 8-K filings from SEC EDGAR and stores them locally.

Usage:
    python sec_fetcher.py

Output structure:
    data/raw/sec_filings/
    └── AAPL/
        ├── 10-K/
        │   ├── 2024/
        │   │   ├── metadata.json
        │   │   └── filing.htm
        │   ├── 2023/
        │   └── 2022/
        ├── 10-Q/
        │   ├── 2026_Q2/
        │   └── ...
        └── 8-K/
            └── ...
"""

import json
import time
import logging
import requests
from datetime import datetime, timezone
from pathlib import Path

# ── Configuration ─────────────────────────────────────────────────────────────

# Company being fetched. TICKER also names the per-company output folder.
COMPANY_NAME = "Apple Inc."
TICKER       = "AAPL"
CIK          = "0000320193"          # Apple's SEC CIK (with leading zeros)
CIK_PLAIN    = "320193"              # Without leading zeros (for archive URLs)

# Form types to download, processed in this order.
FILING_TYPES = ["10-K", "10-Q", "8-K"]

# How many filings to take per form type: the first N matching entries of the
# submissions index (presumably newest first — confirm against EDGAR's ordering).
FETCH_COUNT  = {
    "10-K": 3,
    "10-Q": 6,
    "8-K" : 5,
}

# Output root is resolved relative to this file (two directory levels up),
# not relative to the current working directory.
BASE_DIR   = Path(__file__).parent.parent / "data" / "raw" / "sec_filings"
OUTPUT_DIR = BASE_DIR / TICKER

# SEC EDGAR endpoints
# NOTE: data.sec.gov  → submissions JSON (metadata)
#       www.sec.gov   → archive file downloads
# These are TWO different hosts — headers must match the host being called.
SUBMISSIONS_URL = f"https://data.sec.gov/submissions/CIK{CIK}.json"

# SEC requires a descriptive User-Agent (their policy)
USER_AGENT = "Morningstar RAG Research Pipeline contact@example.com"

REQUEST_DELAY = 0.15   # 150ms between requests — stays under SEC's 10 req/sec

# ── Logging ───────────────────────────────────────────────────────────────────

# Log directory lives two levels above this file. It is created at import
# time so the FileHandler below can open its file.
LOG_DIR = Path(__file__).parent.parent / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Root logger configuration: INFO and above go both to logs/sec_fetcher.log
# and to the console (StreamHandler's default stream). DEBUG records are
# dropped at this level.
logging.basicConfig(
    level   = logging.INFO,
    format  = "%(asctime)s  %(levelname)-8s  %(message)s",
    handlers=[
        logging.FileHandler(LOG_DIR / "sec_fetcher.log"),
        logging.StreamHandler(),
    ]
)
# Module-level logger used by every function and method in this file.
log = logging.getLogger(__name__)


# ── HTTP helpers ──────────────────────────────────────────────────────────────

def make_headers(host: str) -> dict:
    """
    Return the HTTP headers to send with a request to ``host``.

    The result always carries the module-level ``USER_AGENT``, a fixed
    ``Accept-Encoding`` and a ``Host`` header set to the given host name.
    """
    headers = {}
    headers["User-Agent"] = USER_AGENT
    headers["Accept-Encoding"] = "gzip, deflate"
    headers["Host"] = host
    return headers


def get(url: str, stream: bool = False, retries: int = 3) -> requests.Response:
    """
    GET ``url`` with rate limiting and retries.

    Every attempt first sleeps ``REQUEST_DELAY`` seconds, then sends the
    request with headers built for the URL's own host and a 30 second timeout.

    Args:
        url: Absolute URL to fetch.
        stream: Passed through to ``requests.get``; when True the body is not
            read eagerly and the caller is responsible for closing the
            returned response.
        retries: Maximum number of attempts (HTTP 429 responses count as
            attempts too).

    Returns:
        The first response whose status passes ``raise_for_status()``.

    Raises:
        RuntimeError: when no attempt succeeded. The last underlying error,
            if any, is attached as ``__cause__``.
    """
    from urllib.parse import urlparse

    host = urlparse(url).netloc   # e.g. "data.sec.gov" or "www.sec.gov"
    last_error = None

    for attempt in range(1, retries + 1):
        try:
            time.sleep(REQUEST_DELAY)
            resp = requests.get(
                url,
                headers=make_headers(host),
                timeout=30,
                stream=stream,
            )

            if resp.status_code == 429:
                # Retry-After may be delta-seconds or an HTTP-date. Only the
                # numeric form is honoured; anything else falls back to 60s
                # instead of raising ValueError out of the retry loop.
                try:
                    wait = max(0, int(resp.headers.get("Retry-After", 60)))
                except (TypeError, ValueError):
                    wait = 60
                # Release the connection before sleeping (matters when
                # stream=True, where the body has not been consumed).
                resp.close()
                log.warning(f"Rate limited — waiting {wait}s ...")
                time.sleep(wait)
                continue

            try:
                resp.raise_for_status()
            except requests.RequestException:
                # Do not leak the connection of a rejected response.
                resp.close()
                raise
            return resp

        except requests.RequestException as e:
            last_error = e
            log.warning(f"Attempt {attempt}/{retries} failed for {url}: {e}")
            if attempt < retries:
                # Linear back-off: 5s, 10s, ...
                time.sleep(5 * attempt)

    raise RuntimeError(f"Failed after {retries} attempts: {url}") from last_error


def save_json(data: dict, path: Path):
    """Write ``data`` to ``path`` as 2-space-indented JSON, creating parent directories first."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w") as handle:
        json.dump(obj=data, fp=handle, indent=2)

def save_file(url: str, path: Path) -> bool:
    """
    Stream-download ``url`` to ``path``.

    The body is written to a sibling ``<name>.part`` file and renamed onto
    ``path`` only after the whole stream has been received, so an interrupted
    download never leaves a truncated file at ``path`` (and never clobbers a
    previously complete one).

    Args:
        url: Document URL to download.
        path: Destination file; parent directories are created as needed.

    Returns:
        True when the file was fully written, False on any failure (the
        error is logged, not raised).
    """
    partial = path.with_name(path.name + ".part")
    try:
        # The context manager closes the streamed response and releases its
        # connection even if writing fails.
        with get(url, stream=True) as resp:
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        partial.replace(path)
        size_kb = path.stat().st_size / 1024
        log.info(f"    Saved {path.name}  ({size_kb:.1f} KB)")
        return True
    except Exception as e:
        # Best-effort by design: callers only need a success flag.
        log.error(f"    Could not save {path.name}: {e}")
        try:
            partial.unlink()
        except OSError:
            # Nothing to clean up (or it cannot be removed); the failure
            # itself has already been logged above.
            pass
        return False


def now_utc() -> str:
    """Return the current time in UTC as an ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()


# ── Filing index lookup ───────────────────────────────────────────────────────

def fetch_filing_doc_list(accession_fmt: str) -> list:
    """
    Fetch the document list for one filing from its EDGAR directory index.

    The listing is read from the filing directory's ``index.json`` on
    www.sec.gov (the host that serves ``/Archives``).

    Args:
        accession_fmt: accession number with dashes removed,
            e.g. 000032019324000123

    Returns:
        A list of ``{name, type, size, url}`` dicts, one per directory entry
        that has a name. Returns an empty list when the index cannot be
        fetched or parsed (the failure is logged, not raised).
    """
    filing_base = (
        f"https://www.sec.gov/Archives/edgar/data/"
        f"{CIK_PLAIN}/{accession_fmt}"
    )
    index_url = f"{filing_base}/index.json"
    try:
        payload = get(index_url).json()
        items = payload.get("directory", {}).get("item", [])
        return [
            {
                "name": entry["name"],
                "type": entry.get("type", ""),
                "size": entry.get("size", ""),
                "url" : f"{filing_base}/{entry['name']}",
            }
            for entry in items
            if isinstance(entry, dict) and "name" in entry
        ]
    except Exception as e:
        # Best-effort: the doc list is supplementary metadata. Logged at
        # WARNING so the failure is visible under the INFO-level config.
        log.warning(f"    Doc list fetch failed for {accession_fmt}: {e}")
        return []


# ── Main fetcher class ────────────────────────────────────────────────────────

class SECEdgarFetcher:
    """
    Downloads the configured company's filings into ``output_dir``.

    For every form type in ``FILING_TYPES`` the first ``FETCH_COUNT`` matching
    entries of the submissions index are stored as
    ``<output_dir>/<form>/<folder>/`` containing ``metadata.json`` plus the
    downloaded filing document.
    """

    def __init__(self, output_dir: Path):
        """Remember ``output_dir`` and create it (with parents) if missing."""
        self.output_dir = output_dir
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run(self):
        """
        Execute one full fetch: submissions index, company metadata, then
        each configured form type. Writes ``company_metadata.json`` and
        ``fetch_summary.json`` into ``output_dir``. Returns None; aborts early
        (after logging) when the submissions index cannot be fetched.
        """
        log.info("=" * 60)
        log.info(f"SEC EDGAR Fetcher  —  {COMPANY_NAME} ({TICKER})")
        log.info(f"CIK       : {CIK}")
        log.info(f"Output    : {self.output_dir}")
        log.info("=" * 60)

        submissions = self._fetch_submissions()
        if not submissions:
            log.error("Could not fetch submissions. Aborting.")
            return

        # Save company-level metadata once
        save_json({
            "name"        : submissions.get("name"),
            "cik"         : CIK,
            "ticker"      : TICKER,
            "sic"         : submissions.get("sic"),
            "sic_desc"    : submissions.get("sicDescription"),
            "state"       : submissions.get("stateOfIncorporation"),
            "fiscal_year" : submissions.get("fiscalYearEnd"),
            "fetched_at"  : now_utc(),
        }, self.output_dir / "company_metadata.json")

        log.info(f"Company   : {submissions.get('name')}")
        log.info(f"Industry  : {submissions.get('sicDescription')}")

        filings = self._parse_filings(submissions)
        summary = {}

        for ftype in FILING_TYPES:
            # Unlisted form types default to 3 filings.
            n       = FETCH_COUNT.get(ftype, 3)
            subset  = [f for f in filings if f["form"] == ftype][:n]
            log.info(f"\n--- {ftype}  ({len(subset)} filings) ---")
            saved   = self._process_filing_type(ftype, subset)
            summary[ftype] = {"requested": n, "saved": saved}

        save_json({"run_at": now_utc(), "summary": summary},
                  self.output_dir / "fetch_summary.json")

        log.info("\n" + "=" * 60)
        log.info("Done.")
        for ftype, s in summary.items():
            log.info(f"  {ftype:5s}  {s['saved']}/{s['requested']} saved")
        log.info("=" * 60)

    # ── Submissions ───────────────────────────────────────────────────────────

    def _fetch_submissions(self) -> dict:
        """Return the parsed submissions JSON, or ``{}`` when the fetch fails."""
        log.info("Fetching submissions index ...")
        try:
            return get(SUBMISSIONS_URL).json()
        except Exception as e:
            log.error(f"Submissions fetch failed: {e}")
            return {}

    def _parse_filings(self, submissions: dict) -> list:
        """
        Flatten the column-oriented ``filings.recent`` section into a list of
        per-filing dicts with keys ``form``, ``date``, ``accession``,
        ``acc_fmt`` and ``primary_doc``. Index order is preserved; ``zip``
        stops at the shortest column.
        """
        recent  = submissions.get("filings", {}).get("recent", {})
        forms   = recent.get("form",            [])
        dates   = recent.get("filingDate",       [])
        accnums = recent.get("accessionNumber",  [])
        docs    = recent.get("primaryDocument",  [])

        filings = [
            {
                "form"       : form,
                "date"       : date,
                "accession"  : acc,                          # with dashes
                "acc_fmt"    : acc.replace("-", ""),         # without dashes
                "primary_doc": doc,
            }
            for form, date, acc, doc in zip(forms, dates, accnums, docs)
        ]
        log.info(f"Total filings in index: {len(filings)}")
        return filings

    # ── Per-filing processing ─────────────────────────────────────────────────

    def _process_filing_type(self, ftype: str, filings: list) -> int:
        """Process each filing in ``filings``; return how many succeeded."""
        return sum(1 for filing in filings if self._process_one(ftype, filing))

    def _process_one(self, ftype: str, filing: dict) -> bool:
        """
        Download one filing and write its ``metadata.json``.

        Args:
            ftype: Form type, used for the output sub-folder and metadata.
            filing: One dict as produced by ``_parse_filings``.

        Returns:
            True when a filing document is on disk (freshly downloaded or
            already cached), False when no document could be downloaded.
        """
        date     = filing["date"]
        acc      = filing["accession"]
        acc_fmt  = filing["acc_fmt"]
        prim_doc = filing["primary_doc"]

        folder      = self._folder_name(ftype, date, acc)
        filing_dir  = self.output_dir / ftype / folder
        meta_path   = filing_dir / "metadata.json"

        # Already downloaded: skip only if a filing file (>1 KB) also exists.
        # NOTE(review): filings that map to the same folder name (e.g. two
        # 8-Ks filed on the same date) share this directory, so the later one
        # is reported as cached — confirm whether that is acceptable.
        if meta_path.exists():
            has_filing = any(
                f.name != "metadata.json" and f.stat().st_size > 1024
                for f in filing_dir.iterdir()
            )
            if has_filing:
                log.info(f"  SKIP  {ftype}/{folder}  (already cached)")
                return True
            log.info(f"  Re-fetching  {ftype}/{folder}  (metadata exists but no filing file)")

        log.info(f"  Fetching  {ftype}/{folder}  ({date}) ...")

        # Fetch document list from the filing index
        doc_list = fetch_filing_doc_list(acc_fmt)

        # Build the primary document download URL
        # Archive base: https://www.sec.gov/Archives/edgar/data/<CIK>/<acc_fmt>/
        archive_base = (
            f"https://www.sec.gov/Archives/edgar/data/{CIK_PLAIN}/{acc_fmt}"
        )
        doc_url = f"{archive_base}/{prim_doc}"

        # Save metadata first (even if download fails, we have provenance)
        metadata = {
            "ticker"      : TICKER,
            "form"        : ftype,
            "filing_date" : date,
            "fiscal_year" : date[:4],      # calendar year of the filing date
            "accession"   : acc,
            "primary_doc" : prim_doc,
            "doc_url"     : doc_url,
            "archive_base": archive_base,
            "all_docs"    : doc_list,
            "source"      : "SEC EDGAR",
            "license"     : "public",
            "fetched_at"  : now_utc(),
        }
        save_json(metadata, meta_path)

        # Download primary document; fall back to a matching .htm from the
        # doc list when that fails.
        ext      = Path(prim_doc).suffix.lower() or ".htm"
        doc_path = filing_dir / f"filing{ext}"
        if save_file(doc_url, doc_path):
            local_path = doc_path
        else:
            local_path = self._download_alternative(ftype, filing_dir, doc_list)

        if local_path is None:
            log.warning(f"  FAILED  {ftype}/{folder} — metadata saved, file not downloaded")
            return False

        # Record the file that was actually written (primary or alternative).
        metadata["local_path"] = str(local_path)
        save_json(metadata, meta_path)
        return True

    def _download_alternative(self, ftype: str, filing_dir: Path, doc_list: list):
        """
        Try each ``.htm`` document whose name contains the dash-less form
        type (e.g. "10k") and return the path of the first one saved, or
        None when none could be downloaded.
        """
        needle = ftype.replace("-", "").lower()
        for d in doc_list:
            name = d["name"]
            if not (name.endswith(".htm") and needle in name.lower()):
                continue
            log.info(f"    Trying alternative: {d['name']}")
            alt_path = filing_dir / name
            if save_file(d["url"], alt_path):
                return alt_path
        return None

    def _folder_name(self, ftype: str, date: str, acc: str) -> str:
        """
        Return the folder name for a filing, derived from its filing date
        (``YYYY-MM-DD``): the year for 10-K, ``<year>_Q<n>`` (calendar
        quarter of the filing date) for 10-Q, and the full date otherwise.
        ``acc`` is accepted for interface compatibility but not used.
        """
        year = date[:4]
        if ftype == "10-K":
            return year
        if ftype == "10-Q":
            # Only 10-Q needs the month, so only 10-Q parses it.
            quarter = (int(date[5:7]) - 1) // 3 + 1
            return f"{year}_Q{quarter}"
        return date   # 8-K: full date


# ── Entry point ───────────────────────────────────────────────────────────────

# Script entry point: run one full fetch into OUTPUT_DIR. Nothing is fetched
# when this module is merely imported.
if __name__ == "__main__":
    fetcher = SECEdgarFetcher(output_dir=OUTPUT_DIR)
    fetcher.run()