"""SEC EDGAR integration — fetches 10-K/10-Q filings and extracts MD&A (Item 7) and Risk Factors (Item 1A) for the analyst prompt. Uses the EDGAR EFTS full-text search API (same endpoint used in ``src/discovery/insider_feed.py``) and BeautifulSoup for HTML parsing. """ import re import time from datetime import datetime, timedelta import requests from bs4 import BeautifulSoup from langchain_core.tools import tool from langchain_text_splitters import RecursiveCharacterTextSplitter from src.core.logger import get_logger logger = get_logger(__name__) _SEC_HEADERS = { "User-Agent": "PrimoGreedy/1.0 (contact@primogreedy.com)", "Accept": "application/json", } _EFTS_URL = "https://efts.sec.gov/LATEST/search-index" _MAX_SECTION_CHARS = 2000 _SPLITTER = RecursiveCharacterTextSplitter( chunk_size=_MAX_SECTION_CHARS, chunk_overlap=200 ) # Regex patterns for section headers in 10-K/10-Q filings _MDA_PATTERN = re.compile( r"Item\s*7[\.\s\—\-]+.*?Management.s\s+Discussion|" r"Item\s*7[\.\s\—\-]|" r"Management.s\s+Discussion\s+and\s+Analysis", re.IGNORECASE, ) _RISK_PATTERN = re.compile( r"Item\s*1A[\.\s\—\-]+.*?Risk\s+Factors|" r"Item\s*1A[\.\s\—\-]|" r"Risk\s+Factors", re.IGNORECASE, ) _NEXT_ITEM_PATTERN = re.compile(r"Item\s*\d+[A-Z]?[\.\s\—\-]", re.IGNORECASE) # --------------------------------------------------------------------------- # EFTS search — find the most recent 10-K or 10-Q for a ticker # --------------------------------------------------------------------------- def _search_filings(ticker: str) -> dict | None: """Query EDGAR EFTS for the most recent annual/quarterly filing. Returns the first hit as a dict with ``file_url``, ``form_type``, ``file_date``, ``company_name``, or *None* if nothing found. """ two_years_ago = (datetime.now() - timedelta(days=730)).strftime("%Y-%m-%d") today = datetime.now().strftime("%Y-%m-%d") params = { "q": "", "forms": "10-K,10-Q", "dateRange": "custom", "startdt": two_years_ago, "enddt": today, "tickers": ticker, } try: resp = requests.get(_EFTS_URL, params=params, headers=_SEC_HEADERS, timeout=10) if resp.status_code != 200: logger.info("EDGAR EFTS returned %d for %s", resp.status_code, ticker) return None hits = resp.json().get("hits", {}).get("hits", []) if not hits: return None src = hits[0].get("_source", {}) file_num = src.get("file_num", "") accession = ( src.get("accession_no", "") .replace("-", "") ) primary_doc = src.get("file_description", "") if not primary_doc: primary_doc = src.get("display_names", [""])[0] if src.get("display_names") else "" entity_id = src.get("entity_id", "") return { "form_type": src.get("form_type", "10-K"), "file_date": src.get("file_date", ""), "company_name": src.get("display_names", [""])[0] if src.get("display_names") else ticker, "entity_id": entity_id, "accession": accession, "file_num": file_num, } except requests.RequestException as exc: logger.warning("EDGAR EFTS request failed for %s: %s", ticker, exc) return None def _fetch_filing_index(entity_id: str, accession: str) -> str | None: """Fetch the filing index page and return the URL of the primary HTML document.""" if not entity_id or not accession: return None index_url = ( f"https://www.sec.gov/Archives/edgar/data/{entity_id}/{accession}/" ) time.sleep(0.5) try: resp = requests.get( index_url, headers={**_SEC_HEADERS, "Accept": "text/html"}, timeout=10, ) if resp.status_code != 200: return None soup = BeautifulSoup(resp.text, "html.parser") for a_tag in soup.find_all("a", href=True): href = a_tag["href"] if href.endswith(".htm") or href.endswith(".html"): if not href.startswith("http"): href = f"https://www.sec.gov{href}" if href.startswith("/") else f"{index_url}{href}" return href return None except requests.RequestException as exc: logger.warning("EDGAR filing index fetch failed: %s", exc) return None # --------------------------------------------------------------------------- # HTML parser — extract MD&A and Risk Factors sections # --------------------------------------------------------------------------- def _extract_section(full_text: str, start_pattern: re.Pattern, label: str) -> str: """Find a section by *start_pattern* and return text up to the next Item header.""" match = start_pattern.search(full_text) if not match: return "" start = match.end() remainder = full_text[start:] end_match = _NEXT_ITEM_PATTERN.search(remainder, pos=200) if end_match: section_text = remainder[: end_match.start()] else: section_text = remainder[:_MAX_SECTION_CHARS * 2] section_text = section_text.strip() if not section_text: return "" if len(section_text) > _MAX_SECTION_CHARS: chunks = _SPLITTER.split_text(section_text) section_text = chunks[0] if chunks else section_text[:_MAX_SECTION_CHARS] return section_text def parse_mda_risk_factors(html: str) -> str: """Extract MD&A and Risk Factors from a 10-K/10-Q HTML document. Returns a formatted string ready for ``{sec_context}`` in the prompt, or an empty string if extraction fails. """ try: soup = BeautifulSoup(html, "html.parser") for tag in soup(["script", "style", "meta", "link"]): tag.decompose() full_text = soup.get_text(separator="\n", strip=True) mda = _extract_section(full_text, _MDA_PATTERN, "MD&A") risk = _extract_section(full_text, _RISK_PATTERN, "Risk Factors") if not mda and not risk: return "" parts = ["SEC FILING GROUND TRUTH:"] if mda: parts.append(f"\nMD&A SUMMARY (Item 7):\n{mda}") if risk: parts.append(f"\nRISK FACTORS (Item 1A):\n{risk}") return "\n".join(parts) except Exception as exc: logger.warning("SEC filing parse error: %s", exc) return "" # --------------------------------------------------------------------------- # Public @tool — used by analyst_node # --------------------------------------------------------------------------- @tool def get_sec_filings(ticker: str) -> str: """Fetch the most recent 10-K or 10-Q filing from SEC EDGAR and extract MD&A and Risk Factors sections for investment analysis. Args: ticker: US stock ticker symbol (e.g. AAPL, MSFT) """ if "." in ticker: return "" filing = _search_filings(ticker) if not filing: logger.info("No SEC filings found for %s", ticker) return "" logger.info( "Found %s for %s (filed %s)", filing["form_type"], ticker, filing["file_date"], ) doc_url = _fetch_filing_index(filing["entity_id"], filing["accession"]) if not doc_url: return ( f"SEC FILING GROUND TRUTH:\n" f"Found {filing['form_type']} filed {filing['file_date']} " f"but could not retrieve document." ) time.sleep(0.5) try: resp = requests.get( doc_url, headers={**_SEC_HEADERS, "Accept": "text/html"}, timeout=15, ) if resp.status_code != 200: return "" if len(resp.text) > 2_000_000: logger.info("SEC filing too large (%d bytes), truncating", len(resp.text)) html = resp.text[:2_000_000] else: html = resp.text except requests.RequestException as exc: logger.warning("SEC filing fetch failed for %s: %s", ticker, exc) return "" result = parse_mda_risk_factors(html) if result: header = f"[Source: {filing['form_type']} filed {filing['file_date']}]" return f"{result}\n{header}" return ""