Spaces:

Pushkya
/

Financial_bot

Running

App Files Files Community

Financial_bot / scripts /sec_fetcher.py

Pushkya

Upload 30 files

8299003 verified 1 day ago

Raw

History Blame Contribute Delete

13.8 kB

	"""
	SEC EDGAR Fetcher for Apple Inc. (AAPL)
	========================================
	Fetches 10-K, 10-Q, and 8-K filings from SEC EDGAR and stores them locally.

	Usage:
	python sec_fetcher.py

	Output structure:
	    data/raw/sec_filings/
	    └── AAPL/
	        ├── 10-K/
	        │   ├── 2024/
	        │   │   ├── metadata.json
	        │   │   └── filing.htm
	        │   ├── 2023/
	        │   └── 2022/
	        ├── 10-Q/
	        │   ├── 2026_Q2/
	        │   └── ...
	        └── 8-K/
	            └── ...
	"""

	import json
	import time
	import logging
	import requests
	from datetime import datetime, timezone
	from pathlib import Path

	# ── Configuration ─────────────────────────────────────────────────────────────

	# Company whose filings are fetched.
	COMPANY_NAME = "Apple Inc."
	TICKER = "AAPL"

	# EDGAR company identifier. The zero-padded form is used in the submissions
	# URL below; the unpadded form is used when building archive URLs.
	CIK = "0000320193"
	CIK_PLAIN = CIK.lstrip("0")

	# Form types to process, in processing order.
	FILING_TYPES = ["10-K", "10-Q", "8-K"]

	# Number of filings to download per form type (first N found in the index).
	FETCH_COUNT = {"10-K": 3, "10-Q": 6, "8-K": 5}

	# Local storage root: <parent of scripts dir>/data/raw/sec_filings/<TICKER>
	BASE_DIR = Path(__file__).parent.parent.joinpath("data", "raw", "sec_filings")
	OUTPUT_DIR = BASE_DIR.joinpath(TICKER)

	# SEC EDGAR endpoints
	#   data.sec.gov -> submissions JSON (metadata)
	#   www.sec.gov  -> archive file downloads
	# These are two different hosts; request headers must match the host called.
	SUBMISSIONS_URL = "https://data.sec.gov/submissions/" + f"CIK{CIK}.json"

	# SEC policy requires a descriptive User-Agent on every request.
	USER_AGENT = "Morningstar RAG Research Pipeline contact@example.com"

	# Pause before each request (seconds); 150 ms keeps the request rate
	# under SEC's limit of 10 requests per second.
	REQUEST_DELAY = 0.15

	# ── Logging ───────────────────────────────────────────────────────────────────

	# Log directory sits two levels above this file; created on import if missing.
	LOG_DIR = Path(__file__).parent.parent.joinpath("logs")
	LOG_DIR.mkdir(parents=True, exist_ok=True)

	# Root logger: INFO and above, written to the log file and to the console.
	logging.basicConfig(
	    handlers=[
	        logging.FileHandler(LOG_DIR.joinpath("sec_fetcher.log")),
	        logging.StreamHandler(),
	    ],
	    format="%(asctime)s %(levelname)-8s %(message)s",
	    level=logging.INFO,
	)

	# Module-level logger used by every function in this file.
	log = logging.getLogger(__name__)


	# ── HTTP helpers ──────────────────────────────────────────────────────────────

	def make_headers(host: str) -> dict:
	    """
	    Return the request headers to send to ``host``.

	    Always carries the module's User-Agent (required by SEC) and an explicit
	    Host header set to the destination passed in.
	    """
	    headers = {"User-Agent": USER_AGENT}
	    headers["Accept-Encoding"] = "gzip, deflate"
	    headers["Host"] = host
	    return headers


	def get(url: str, stream: bool = False, retries: int = 3) -> requests.Response:
	    """
	    GET ``url`` with rate limiting and retries.

	    Sleeps REQUEST_DELAY before every attempt. The Host header is derived
	    from the URL so it always matches the server being called.

	    Args:
	        url: Absolute URL to fetch.
	        stream: Passed to ``requests.get``; when True the body is not read
	            eagerly and the caller is responsible for closing the response.
	        retries: Maximum number of attempts.

	    Returns:
	        The successful (2xx) response.

	    Raises:
	        RuntimeError: if no attempt succeeded. The last underlying request
	            error, if any, is attached as ``__cause__``.
	    """
	    from urllib.parse import urlparse

	    host = urlparse(url).netloc  # e.g. "data.sec.gov" or "www.sec.gov"
	    last_error = None

	    for attempt in range(1, retries + 1):
	        try:
	            time.sleep(REQUEST_DELAY)
	            resp = requests.get(
	                url,
	                headers=make_headers(host),
	                timeout=30,
	                stream=stream,
	            )

	            if resp.status_code == 429:
	                # Retry-After may be delta-seconds or an HTTP-date; only the
	                # numeric form is honoured, anything else falls back to 60s
	                # instead of raising ValueError out of the retry loop.
	                try:
	                    wait = max(int(resp.headers.get("Retry-After", 60)), 0)
	                except (TypeError, ValueError):
	                    wait = 60
	                resp.close()  # release the connection (matters when stream=True)
	                if attempt < retries:
	                    log.warning(f"Rate limited — waiting {wait}s ...")
	                    time.sleep(wait)
	                else:
	                    # No attempt left: sleeping would only delay the failure.
	                    log.warning(f"Rate limited on final attempt {attempt}/{retries} for {url}")
	                continue

	            resp.raise_for_status()
	            return resp

	        except requests.RequestException as e:
	            last_error = e
	            # An HTTP error carries the response; close it so a streamed
	            # connection is not left open.
	            failed = getattr(e, "response", None)
	            if failed is not None:
	                failed.close()
	            log.warning(f"Attempt {attempt}/{retries} failed for {url}: {e}")
	            if attempt < retries:
	                time.sleep(5 * attempt)  # linear back-off: 5s, 10s, ...

	    raise RuntimeError(f"Failed after {retries} attempts: {url}") from last_error


	def save_json(data: dict, path: Path):
	    """Write ``data`` to ``path`` as 2-space-indented JSON, creating parent directories."""
	    path.parent.mkdir(parents=True, exist_ok=True)
	    serialized = json.dumps(data, indent=2)
	    path.write_text(serialized)


	def save_file(url: str, path: Path) -> bool:
	    """
	    Stream-download ``url`` to ``path``.

	    The body is written to a sibling ``<name>.part`` file and renamed onto
	    ``path`` only once the whole download has succeeded. A failed or
	    interrupted transfer therefore never leaves a truncated file at ``path``
	    that could later be mistaken for a finished download.

	    Returns:
	        True on success; False on any failure (the error is logged).
	    """
	    part_path = path.with_name(path.name + ".part")
	    try:
	        # Using the response as a context manager releases the streamed
	        # connection even if writing fails part-way through.
	        with get(url, stream=True) as resp:
	            path.parent.mkdir(parents=True, exist_ok=True)
	            with open(part_path, "wb") as f:
	                for chunk in resp.iter_content(chunk_size=8192):
	                    if chunk:
	                        f.write(chunk)
	        part_path.replace(path)
	        size_kb = path.stat().st_size / 1024
	        log.info(f" Saved {path.name} ({size_kb:.1f} KB)")
	        return True
	    except Exception as e:
	        # Deliberately broad: this is a best-effort download and callers
	        # only need to know whether the file is on disk.
	        log.error(f" Could not save {path.name}: {e}")
	        try:
	            part_path.unlink()
	        except OSError:
	            pass  # nothing was written, or the partial file cannot be removed
	        return False


	def now_utc() -> str:
	    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
	    current = datetime.now(tz=timezone.utc)
	    return current.isoformat()


	# ── Filing index lookup ───────────────────────────────────────────────────────

	def fetch_filing_doc_list(accession_fmt: str) -> list:
	    """
	    Fetch the document list for one filing from its EDGAR archive directory.

	    Reads the directory's ``index.json`` listing and returns one entry per
	    listed file. This is best-effort: any failure is logged and an empty
	    list is returned.

	    Args:
	        accession_fmt: accession number with dashes removed,
	            e.g. 000032019324000123

	    Returns:
	        A list of {name, type, size, url} dicts (possibly empty).
	    """
	    # Archive content is served from www.sec.gov; data.sec.gov only hosts the
	    # JSON APIs such as the submissions index.
	    filing_base = (
	        f"https://www.sec.gov/Archives/edgar/data/"
	        f"{CIK_PLAIN}/{accession_fmt}"
	    )
	    url = f"{filing_base}/index.json"
	    try:
	        items = get(url).json().get("directory", {}).get("item", [])
	        return [
	            {
	                "name": d["name"],
	                "type": d.get("type", ""),
	                "size": d.get("size", ""),
	                "url": f"{filing_base}/{d['name']}",
	            }
	            for d in items
	            if isinstance(d, dict) and "name" in d
	        ]
	    except Exception as e:
	        # Deliberately broad (network error, bad JSON, unexpected shape).
	        # Logged at WARNING so the failure is visible at the INFO log level.
	        log.warning(f" Doc list fetch failed for {accession_fmt}: {e}")
	        return []


	# ── Main fetcher class ────────────────────────────────────────────────────────

	class SECEdgarFetcher:
	    """
	    Download the configured company's filings into ``output_dir``.

	    Layout written under ``output_dir``:
	        company_metadata.json
	        fetch_summary.json
	        <form type>/<folder>/metadata.json
	        <form type>/<folder>/<downloaded document>
	    """

	    def __init__(self, output_dir: Path):
	        """Remember ``output_dir`` and create it if it does not exist."""
	        self.output_dir = output_dir
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	    def run(self):
	        """
	        Fetch the submissions index, then download the first FETCH_COUNT
	        filings of each type in FILING_TYPES and write a run summary.
	        """
	        log.info("=" * 60)
	        log.info(f"SEC EDGAR Fetcher — {COMPANY_NAME} ({TICKER})")
	        log.info(f"CIK : {CIK}")
	        log.info(f"Output : {self.output_dir}")
	        log.info("=" * 60)

	        submissions = self._fetch_submissions()
	        if not submissions:
	            log.error("Could not fetch submissions. Aborting.")
	            return

	        # Save company-level metadata once
	        save_json({
	            "name": submissions.get("name"),
	            "cik": CIK,
	            "ticker": TICKER,
	            "sic": submissions.get("sic"),
	            "sic_desc": submissions.get("sicDescription"),
	            "state": submissions.get("stateOfIncorporation"),
	            "fiscal_year": submissions.get("fiscalYearEnd"),
	            "fetched_at": now_utc(),
	        }, self.output_dir / "company_metadata.json")

	        log.info(f"Company : {submissions.get('name')}")
	        log.info(f"Industry : {submissions.get('sicDescription')}")

	        filings = self._parse_filings(submissions)
	        summary = {}

	        for ftype in FILING_TYPES:
	            n = FETCH_COUNT.get(ftype, 3)
	            subset = [f for f in filings if f["form"] == ftype][:n]
	            log.info(f"\n--- {ftype} ({len(subset)} filings) ---")
	            saved = self._process_filing_type(ftype, subset)
	            summary[ftype] = {"requested": n, "saved": saved}

	        save_json({"run_at": now_utc(), "summary": summary},
	                  self.output_dir / "fetch_summary.json")

	        log.info("\n" + "=" * 60)
	        log.info("Done.")
	        for ftype, s in summary.items():
	            log.info(f" {ftype:5s} {s['saved']}/{s['requested']} saved")
	        log.info("=" * 60)

	    # ── Submissions ───────────────────────────────────────────────────────────

	    def _fetch_submissions(self) -> dict:
	        """Return the parsed submissions JSON, or {} if it could not be fetched."""
	        log.info("Fetching submissions index ...")
	        try:
	            return get(SUBMISSIONS_URL).json()
	        except Exception as e:
	            log.error(f"Submissions fetch failed: {e}")
	            return {}

	    def _parse_filings(self, submissions: dict) -> list:
	        """
	        Turn the parallel arrays under ``filings.recent`` into one dict per
	        filing with keys form, date, accession, acc_fmt and primary_doc.
	        """
	        recent = submissions.get("filings", {}).get("recent", {})
	        forms = recent.get("form", [])
	        dates = recent.get("filingDate", [])
	        accnums = recent.get("accessionNumber", [])
	        docs = recent.get("primaryDocument", [])

	        filings = [
	            {
	                "form": form,
	                "date": date,
	                "accession": acc,                  # with dashes
	                "acc_fmt": acc.replace("-", ""),   # without dashes
	                "primary_doc": doc,
	            }
	            for form, date, acc, doc in zip(forms, dates, accnums, docs)
	        ]
	        log.info(f"Total filings in index: {len(filings)}")
	        return filings

	    # ── Per-filing processing ─────────────────────────────────────────────────

	    def _process_filing_type(self, ftype: str, filings: list) -> int:
	        """Process every filing in ``filings``; return how many ended up on disk."""
	        return sum(1 for filing in filings if self._process_one(ftype, filing))

	    def _process_one(self, ftype: str, filing: dict) -> bool:
	        """
	        Download one filing (metadata plus one document) unless it is cached.

	        Returns True when a document for this filing is on disk (freshly
	        downloaded or already cached), False when no document could be saved.
	        """
	        date = filing["date"]
	        acc = filing["accession"]
	        acc_fmt = filing["acc_fmt"]
	        prim_doc = filing["primary_doc"]

	        folder, filing_dir = self._resolve_filing_dir(ftype, date, acc)
	        meta_path = filing_dir / "metadata.json"

	        # Already downloaded → skip only if a filing file also exists
	        if meta_path.exists():
	            if self._has_cached_filing(filing_dir):
	                log.info(f" SKIP {ftype}/{folder} (already cached)")
	                return True
	            log.info(f" Re-fetching {ftype}/{folder} (metadata exists but no filing file)")

	        log.info(f" Fetching {ftype}/{folder} ({date}) ...")

	        # Fetch document list from the filing index
	        doc_list = fetch_filing_doc_list(acc_fmt)

	        # Archive base: https://www.sec.gov/Archives/edgar/data/<CIK>/<acc_fmt>/
	        archive_base = (
	            f"https://www.sec.gov/Archives/edgar/data/{CIK_PLAIN}/{acc_fmt}"
	        )
	        doc_url = f"{archive_base}/{prim_doc}"

	        # Save metadata first (even if download fails, we have provenance)
	        metadata = {
	            "ticker": TICKER,
	            "form": ftype,
	            "filing_date": date,
	            "fiscal_year": date[:4],  # NOTE: year of the filing date
	            "accession": acc,
	            "primary_doc": prim_doc,
	            "doc_url": doc_url,
	            "archive_base": archive_base,
	            "all_docs": doc_list,
	            "source": "SEC EDGAR",
	            "license": "public",
	            "fetched_at": now_utc(),
	        }
	        save_json(metadata, meta_path)

	        # Download primary document
	        ext = Path(prim_doc).suffix.lower() or ".htm"
	        local_path = filing_dir / f"filing{ext}"
	        success = save_file(doc_url, local_path)

	        if not success:
	            # Try alternatives: .htm files in the doc list named after the form
	            for d in self._alternative_docs(ftype, doc_list):
	                log.info(f" Trying alternative: {d['name']}")
	                # The name comes from a remote listing: keep only its final
	                # component so it cannot point outside filing_dir.
	                local_path = filing_dir / Path(d["name"]).name
	                success = save_file(d["url"], local_path)
	                if success:
	                    break

	        if success:
	            # Record the file that was actually written (primary or alternative).
	            metadata["local_path"] = str(local_path)
	            save_json(metadata, meta_path)
	            return True

	        log.warning(f" FAILED {ftype}/{folder} — metadata saved, file not downloaded")
	        return False

	    def _resolve_filing_dir(self, ftype: str, date: str, acc: str) -> tuple:
	        """
	        Return ``(folder, filing_dir)`` for a filing.

	        Normally the folder is ``_folder_name(...)``. If that folder already
	        holds metadata for a different accession number (e.g. two 8-Ks filed
	        on the same date), the accession is appended so the second filing is
	        stored separately instead of being treated as already cached.
	        """
	        folder = self._folder_name(ftype, date, acc)
	        filing_dir = self.output_dir / ftype / folder
	        owner = self._cached_accession(filing_dir / "metadata.json")
	        if owner is not None and owner != acc:
	            folder = f"{folder}_{acc}"
	            filing_dir = self.output_dir / ftype / folder
	        return folder, filing_dir

	    @staticmethod
	    def _cached_accession(meta_path: Path):
	        """Return the "accession" stored in ``meta_path``, or None if unreadable/absent."""
	        try:
	            with open(meta_path, encoding="utf-8") as f:
	                cached = json.load(f)
	        except (OSError, ValueError):
	            # Missing file, unreadable file or invalid JSON: no known owner.
	            return None
	        if isinstance(cached, dict):
	            return cached.get("accession")
	        return None

	    @staticmethod
	    def _has_cached_filing(filing_dir: Path) -> bool:
	        """True if ``filing_dir`` holds a regular file other than metadata.json larger than 1 KB."""
	        return any(
	            entry.is_file()
	            and entry.name != "metadata.json"
	            and entry.stat().st_size > 1024
	            for entry in filing_dir.iterdir()
	        )

	    @staticmethod
	    def _alternative_docs(ftype: str, doc_list: list) -> list:
	        """Return doc-list entries whose name ends in .htm and contains the form type (e.g. "10k")."""
	        token = ftype.replace("-", "").lower()
	        return [
	            d for d in doc_list
	            if d["name"].endswith(".htm") and token in d["name"].lower()
	        ]

	    def _folder_name(self, ftype: str, date: str, acc: str) -> str:
	        """
	        Map a filing to its folder name from its ``YYYY-MM-DD`` filing date:
	        10-K → ``YYYY``, 10-Q → ``YYYY_Q<calendar quarter>``, anything else
	        → the full date. ``acc`` is accepted for interface compatibility and
	        is not used here.
	        """
	        year = date[:4]
	        if ftype == "10-K":
	            return year
	        if ftype == "10-Q":
	            quarter = (int(date[5:7]) - 1) // 3 + 1
	            return f"{year}_Q{quarter}"
	        return date  # 8-K: full date


	# ── Entry point ───────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Script entry point: fetch everything into the default output directory.
	    SECEdgarFetcher(output_dir=OUTPUT_DIR).run()