# PrimoGreedy-Agent/src/sec_edgar.py
"""SEC EDGAR integration — fetches 10-K/10-Q filings and extracts
MD&A (Item 7) and Risk Factors (Item 1A) for the analyst prompt.
Uses the EDGAR EFTS full-text search API (same endpoint used in
``src/discovery/insider_feed.py``) and BeautifulSoup for HTML parsing.
"""
import re
import time
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
from langchain_core.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from src.core.logger import get_logger
logger = get_logger(__name__)
_SEC_HEADERS = {
"User-Agent": "PrimoGreedy/1.0 (contact@primogreedy.com)",
"Accept": "application/json",
}
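# SEC fair-access guidance asks automated clients to send a descriptive
# User-Agent with contact details and to stay under ~10 requests/second;
# the time.sleep(0.5) calls below keep this module well inside that budget.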
_EFTS_URL = "https://efts.sec.gov/LATEST/search-index"
_MAX_SECTION_CHARS = 2000
_SPLITTER = RecursiveCharacterTextSplitter(
chunk_size=_MAX_SECTION_CHARS, chunk_overlap=200
)
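# Only the first chunk of a split is kept in _extract_section; the recursive
# splitter is used (rather than a plain slice) so the cut tends to land on a
# paragraph or sentence boundary instead of mid-word.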
# Regex patterns for section headers in 10-K/10-Q filings. The "." in
# "Management.s" deliberately matches both straight and curly apostrophes.
_MDA_PATTERN = re.compile(
r"Item\s*7[\.\s\—\-]+.*?Management.s\s+Discussion|"
r"Item\s*7[\.\s\—\-]|"
r"Management.s\s+Discussion\s+and\s+Analysis",
re.IGNORECASE,
)
_RISK_PATTERN = re.compile(
r"Item\s*1A[\.\s\—\-]+.*?Risk\s+Factors|"
r"Item\s*1A[\.\s\—\-]|"
r"Risk\s+Factors",
re.IGNORECASE,
)
_NEXT_ITEM_PATTERN = re.compile(r"Item\s*\d+[A-Z]?[\.\s\—\-]", re.IGNORECASE)
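# Illustrative header strings these patterns are meant to catch (examples,
# not exhaustive):
#   "Item 7. Management's Discussion and Analysis of Financial Condition"
#   "ITEM 1A - RISK FACTORS"
#   "Item 8."  (matched by _NEXT_ITEM_PATTERN, marking where a section ends)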
# ---------------------------------------------------------------------------
# EFTS search — find the most recent 10-K or 10-Q for a ticker
# ---------------------------------------------------------------------------
def _search_filings(ticker: str) -> dict | None:
"""Query EDGAR EFTS for the most recent annual/quarterly filing.
Returns the first hit as a dict with ``file_url``, ``form_type``,
``file_date``, ``company_name``, or *None* if nothing found.
"""
two_years_ago = (datetime.now() - timedelta(days=730)).strftime("%Y-%m-%d")
today = datetime.now().strftime("%Y-%m-%d")
params = {
"q": "",
"forms": "10-K,10-Q",
"dateRange": "custom",
"startdt": two_years_ago,
"enddt": today,
"tickers": ticker,
}
try:
resp = requests.get(_EFTS_URL, params=params, headers=_SEC_HEADERS, timeout=10)
if resp.status_code != 200:
logger.info("EDGAR EFTS returned %d for %s", resp.status_code, ticker)
return None
hits = resp.json().get("hits", {}).get("hits", [])
if not hits:
return None
src = hits[0].get("_source", {})
file_num = src.get("file_num", "")
        # EDGAR archive paths use the accession number with the dashes removed.
        accession = src.get("accession_no", "").replace("-", "")
entity_id = src.get("entity_id", "")
return {
"form_type": src.get("form_type", "10-K"),
"file_date": src.get("file_date", ""),
"company_name": src.get("display_names", [""])[0] if src.get("display_names") else ticker,
"entity_id": entity_id,
"accession": accession,
"file_num": file_num,
}
except requests.RequestException as exc:
logger.warning("EDGAR EFTS request failed for %s: %s", ticker, exc)
return None
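# For orientation, a trimmed EFTS hit is expected to look roughly like this
# (hypothetical values; only fields this module reads are shown, and the
# live response schema may differ):
#   {"_source": {"form_type": "10-K", "file_date": "2024-02-02",
#                "accession_no": "0000320193-23-000106",
#                "display_names": ["Apple Inc.  (AAPL)"],
#                "entity_id": "320193", "file_num": "001-36743"}}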
def _fetch_filing_index(entity_id: str, accession: str) -> str | None:
"""Fetch the filing index page and return the URL of the primary HTML document."""
if not entity_id or not accession:
return None
index_url = (
f"https://www.sec.gov/Archives/edgar/data/{entity_id}/{accession}/"
)
time.sleep(0.5)
try:
resp = requests.get(
index_url,
headers={**_SEC_HEADERS, "Accept": "text/html"},
timeout=10,
)
if resp.status_code != 200:
return None
soup = BeautifulSoup(resp.text, "html.parser")
        # Assume the first HTML document linked on the index page is the
        # primary filing document; EDGAR index pages typically list it first.
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.endswith((".htm", ".html")):
                if not href.startswith("http"):
                    href = f"https://www.sec.gov{href}" if href.startswith("/") else f"{index_url}{href}"
                return href
return None
except requests.RequestException as exc:
logger.warning("EDGAR filing index fetch failed: %s", exc)
return None
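# Example of the index URL built above (hypothetical accession number):
#   entity_id="320193", accession="000032019323000106"
#   -> https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/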
# ---------------------------------------------------------------------------
# HTML parser — extract MD&A and Risk Factors sections
# ---------------------------------------------------------------------------
def _extract_section(full_text: str, start_pattern: re.Pattern, label: str) -> str:
    """Find a section by *start_pattern* and return text up to the next Item header."""
    match = start_pattern.search(full_text)
    if not match:
        logger.debug("Section %r not found in filing text", label)
        return ""
    start = match.end()
    remainder = full_text[start:]
    # Start the end-of-section search 200 chars in so a table-of-contents
    # entry or the header's own "Item" text cannot terminate it immediately.
    end_match = _NEXT_ITEM_PATTERN.search(remainder, pos=200)
if end_match:
section_text = remainder[: end_match.start()]
else:
section_text = remainder[:_MAX_SECTION_CHARS * 2]
section_text = section_text.strip()
if not section_text:
return ""
if len(section_text) > _MAX_SECTION_CHARS:
chunks = _SPLITTER.split_text(section_text)
section_text = chunks[0] if chunks else section_text[:_MAX_SECTION_CHARS]
return section_text
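# Minimal sketch of the behavior on a toy input (illustrative only):
#   text = "Item 1A. Risk Factors\n" + "risk disclosure text " * 20 + "\nItem 2. Properties"
#   _extract_section(text, _RISK_PATTERN, "Risk Factors")
#   -> the ~420 chars of risk text between the header and "Item 2.", stripped;
#      longer sections are capped near _MAX_SECTION_CHARS via the splitter.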
def parse_mda_risk_factors(html: str) -> str:
    """Extract MD&A and Risk Factors from a 10-K/10-Q HTML document.

    Returns a formatted string ready for ``{sec_context}`` in the prompt,
    or an empty string if extraction fails.
    """
try:
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "meta", "link"]):
tag.decompose()
full_text = soup.get_text(separator="\n", strip=True)
mda = _extract_section(full_text, _MDA_PATTERN, "MD&A")
risk = _extract_section(full_text, _RISK_PATTERN, "Risk Factors")
if not mda and not risk:
return ""
parts = ["SEC FILING GROUND TRUTH:"]
if mda:
parts.append(f"\nMD&A SUMMARY (Item 7):\n{mda}")
if risk:
parts.append(f"\nRISK FACTORS (Item 1A):\n{risk}")
return "\n".join(parts)
except Exception as exc:
logger.warning("SEC filing parse error: %s", exc)
return ""
# ---------------------------------------------------------------------------
# Public @tool — used by analyst_node
# ---------------------------------------------------------------------------
@tool
def get_sec_filings(ticker: str) -> str:
"""Fetch the most recent 10-K or 10-Q filing from SEC EDGAR and extract
MD&A and Risk Factors sections for investment analysis.
Args:
ticker: US stock ticker symbol (e.g. AAPL, MSFT)
"""
if "." in ticker:
return ""
filing = _search_filings(ticker)
if not filing:
logger.info("No SEC filings found for %s", ticker)
return ""
logger.info(
"Found %s for %s (filed %s)",
filing["form_type"], ticker, filing["file_date"],
)
doc_url = _fetch_filing_index(filing["entity_id"], filing["accession"])
if not doc_url:
return (
f"SEC FILING GROUND TRUTH:\n"
f"Found {filing['form_type']} filed {filing['file_date']} "
f"but could not retrieve document."
)
time.sleep(0.5)
try:
resp = requests.get(
doc_url,
headers={**_SEC_HEADERS, "Accept": "text/html"},
timeout=15,
)
if resp.status_code != 200:
return ""
        if len(resp.text) > 2_000_000:
            logger.info("SEC filing too large (%d chars), truncating", len(resp.text))
            html = resp.text[:2_000_000]
else:
html = resp.text
except requests.RequestException as exc:
logger.warning("SEC filing fetch failed for %s: %s", ticker, exc)
return ""
result = parse_mda_risk_factors(html)
if result:
header = f"[Source: {filing['form_type']} filed {filing['file_date']}]"
return f"{result}\n{header}"
return ""