Spaces:

CiscsoPonce
/

PrimoGreedy-Agent

Sleeping

File size: 8,301 Bytes

"""SEC EDGAR integration — fetches 10-K/10-Q filings and extracts
MD&A (Item 7) and Risk Factors (Item 1A) for the analyst prompt.

Uses the EDGAR EFTS full-text search API (same endpoint used in
``src/discovery/insider_feed.py``) and BeautifulSoup for HTML parsing.
"""

import re
import time
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
from langchain_core.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.core.logger import get_logger

logger = get_logger(__name__)

# Default HTTP headers sent with every EDGAR request in this module. Callers
# that fetch HTML pages override "Accept" with "text/html".
_SEC_HEADERS = {
    "User-Agent": "PrimoGreedy/1.0 (contact@primogreedy.com)",
    "Accept": "application/json",
}
# EDGAR full-text search (EFTS) endpoint queried by _search_filings.
_EFTS_URL = "https://efts.sec.gov/LATEST/search-index"
# Cap, in characters, on each extracted section; also the splitter chunk size.
_MAX_SECTION_CHARS = 2000
# Shared text splitter. _extract_section only keeps the first chunk it yields,
# using it to cut an over-long section at a natural boundary.
_SPLITTER = RecursiveCharacterTextSplitter(
    chunk_size=_MAX_SECTION_CHARS, chunk_overlap=200
)

# Regex patterns for section headers in 10-K/10-Q filings.
#
# Each start pattern lists three alternatives, tried in this order at every
# position: "Item N" followed by the section title, a bare "Item N" header,
# and the bare section title. Notes:
#   * "Management.s" uses "." in place of the apostrophe, so any single
#     character is accepted there.
#   * ".*?" is compiled without re.DOTALL, so the title has to appear on the
#     same line as the "Item N" prefix for the first alternative to match.
#   * All patterns are case-insensitive.
_MDA_PATTERN = re.compile(
    r"Item\s*7[\.\s\—\-]+.*?Management.s\s+Discussion|"
    r"Item\s*7[\.\s\—\-]|"
    r"Management.s\s+Discussion\s+and\s+Analysis",
    re.IGNORECASE,
)
_RISK_PATTERN = re.compile(
    r"Item\s*1A[\.\s\—\-]+.*?Risk\s+Factors|"
    r"Item\s*1A[\.\s\—\-]|"
    r"Risk\s+Factors",
    re.IGNORECASE,
)
# Any "Item <digits>[letter]" header followed by ".", whitespace or a dash;
# used to locate where an extracted section ends.
_NEXT_ITEM_PATTERN = re.compile(r"Item\s*\d+[A-Z]?[\.\s\—\-]", re.IGNORECASE)


# ---------------------------------------------------------------------------
# EFTS search — find the most recent 10-K or 10-Q for a ticker
# ---------------------------------------------------------------------------

def _search_filings(ticker: str) -> dict | None:
    """Query EDGAR EFTS for a 10-K/10-Q filed within the last two years.

    Only the first hit in the response is used; no re-sorting happens here.

    Args:
        ticker: Ticker symbol, sent as the ``tickers`` query parameter.

    Returns:
        A dict with the keys ``form_type``, ``file_date``, ``company_name``,
        ``entity_id``, ``accession`` (dashes removed) and ``file_num``, or
        *None* when the request fails, the status is not 200, the body is
        not JSON of the expected shape, or there are no hits.
    """
    # Take a single timestamp so both ends of the range derive from the same
    # instant.
    now = datetime.now()
    params = {
        "q": "",
        "forms": "10-K,10-Q",
        "dateRange": "custom",
        "startdt": (now - timedelta(days=730)).strftime("%Y-%m-%d"),
        "enddt": now.strftime("%Y-%m-%d"),
        "tickers": ticker,
    }

    try:
        resp = requests.get(_EFTS_URL, params=params, headers=_SEC_HEADERS, timeout=10)
        if resp.status_code != 200:
            logger.info("EDGAR EFTS returned %d for %s", resp.status_code, ticker)
            return None
        payload = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        logger.warning("EDGAR EFTS request failed for %s: %s", ticker, exc)
        return None

    # Walk the nested structure defensively: a null or oddly-shaped "hits"
    # value is treated as "no results" instead of raising.
    outer = payload.get("hits") if isinstance(payload, dict) else None
    hits = outer.get("hits") if isinstance(outer, dict) else None
    if not isinstance(hits, list) or not hits:
        return None

    first = hits[0]
    src = first.get("_source") if isinstance(first, dict) else None
    if not isinstance(src, dict):
        src = {}

    # NOTE(review): the ``_source`` field names read below (``entity_id``,
    # ``accession_no``, ``form_type``) should be confirmed against a live
    # EFTS response; missing fields silently fall back to the defaults.
    names = src.get("display_names")

    return {
        "form_type": src.get("form_type", "10-K"),
        "file_date": src.get("file_date", ""),
        "company_name": names[0] if names else ticker,
        "entity_id": src.get("entity_id", ""),
        # Archive URLs use the accession number without dashes.
        "accession": str(src.get("accession_no") or "").replace("-", ""),
        "file_num": src.get("file_num", ""),
    }


def _fetch_filing_index(entity_id: str, accession: str) -> str | None:
    """Fetch a filing's archive listing and pick an HTML document from it.

    Args:
        entity_id: Identifier used as the ``edgar/data/<id>`` path segment.
        accession: Accession number without dashes.

    Returns:
        The absolute URL of the first ``.htm``/``.html`` link on the listing
        that is not one of EDGAR's own index pages, or *None* when either
        argument is empty, the request fails, the status is not 200, or no
        such link exists.
    """
    if not entity_id or not accession:
        return None

    index_url = (
        f"https://www.sec.gov/Archives/edgar/data/{entity_id}/{accession}/"
    )

    # Brief pause before the next sec.gov request (presumably to stay within
    # SEC rate limits — confirm the required spacing).
    time.sleep(0.5)

    try:
        resp = requests.get(
            index_url,
            headers={**_SEC_HEADERS, "Accept": "text/html"},
            timeout=10,
        )
    except requests.RequestException as exc:
        logger.warning("EDGAR filing index fetch failed: %s", exc)
        return None

    if resp.status_code != 200:
        # Log the status, matching how the EFTS search reports failures.
        logger.info(
            "EDGAR filing index returned %d for %s", resp.status_code, index_url
        )
        return None

    html_suffixes = (".htm", ".html")
    # Pages that describe the filing rather than contain it; returning one of
    # these would hand the parser a document with no MD&A/Risk text.
    # NOTE(review): suffix list based on typical EDGAR listings — verify.
    index_suffixes = (
        "-index.htm",
        "-index.html",
        "-index-headers.htm",
        "-index-headers.html",
    )

    soup = BeautifulSoup(resp.text, "html.parser")
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        lowered = href.lower()  # extensions may be upper-case
        if not lowered.endswith(html_suffixes) or lowered.endswith(index_suffixes):
            continue

        if href.startswith("http"):
            return href
        if href.startswith("/"):
            # Site-absolute path: prefix the host only.
            return f"https://www.sec.gov{href}"
        # Relative path: resolve against the listing directory.
        return f"{index_url}{href}"

    return None


# ---------------------------------------------------------------------------
# HTML parser — extract MD&A and Risk Factors sections
# ---------------------------------------------------------------------------

def _extract_section(full_text: str, start_pattern: re.Pattern, label: str) -> str:
    """Return the body of the section introduced by *start_pattern*.

    Every occurrence of *start_pattern* is treated as a candidate header.
    A candidate's body runs from the end of the header match to the next
    "Item N" header that starts at least 200 characters later, or, when no
    such header exists, for at most ``2 * _MAX_SECTION_CHARS`` characters.
    The longest stripped body wins; on ties the earliest occurrence is kept.
    This is a heuristic meant to prefer the real section over short
    table-of-contents entries and cross-references, which match the same
    pattern.

    Args:
        full_text: Plain text of the filing.
        start_pattern: Compiled regex that matches the section header.
        label: Human-readable section name, used only in log output.

    Returns:
        The section text, cut to the first splitter chunk when it is longer
        than ``_MAX_SECTION_CHARS``, or ``""`` when nothing was found.
    """
    # Locate every "Item N" header once. Both this list and the header
    # matches below are in ascending order, so a single forward-moving cursor
    # finds each candidate's end without rescanning or copying the text.
    item_starts = [m.start() for m in _NEXT_ITEM_PATTERN.finditer(full_text)]
    cursor = 0
    best = ""

    for match in start_pattern.finditer(full_text):
        body_start = match.end()
        # Ignore "Item N" headers in the first 200 characters of the body
        # (presumably so title residue right after the header is not
        # mistaken for the end of the section).
        search_from = body_start + 200
        while cursor < len(item_starts) and item_starts[cursor] < search_from:
            cursor += 1

        if cursor < len(item_starts):
            candidate = full_text[body_start:item_starts[cursor]]
        else:
            candidate = full_text[body_start:body_start + _MAX_SECTION_CHARS * 2]

        candidate = candidate.strip()
        if len(candidate) > len(best):
            best = candidate

    if not best:
        logger.debug("%s section not found in filing text", label)
        return ""

    if len(best) > _MAX_SECTION_CHARS:
        # Let the splitter choose a natural break; fall back to a hard cut.
        chunks = _SPLITTER.split_text(best)
        best = chunks[0] if chunks else best[:_MAX_SECTION_CHARS]

    return best


def parse_mda_risk_factors(html: str) -> str:
    """Build the ``{sec_context}`` prompt block from a 10-K/10-Q HTML page.

    The HTML is reduced to plain text, then the MD&A and Risk Factors
    sections are looked up with the module's header patterns.

    Args:
        html: Raw HTML of the filing document.

    Returns:
        A string that starts with ``SEC FILING GROUND TRUTH:`` followed by
        whichever sections were found, or ``""`` when neither section is
        found or any exception is raised while parsing.
    """
    try:
        document = BeautifulSoup(html, "html.parser")

        # Remove elements that carry no readable content before flattening.
        for element in document.find_all(["script", "style", "meta", "link"]):
            element.decompose()

        text = document.get_text(separator="\n", strip=True)

        mda = _extract_section(text, _MDA_PATTERN, "MD&A")
        risk = _extract_section(text, _RISK_PATTERN, "Risk Factors")

        if not (mda or risk):
            return ""

        lines = ["SEC FILING GROUND TRUTH:"]
        if mda:
            lines.append(f"\nMD&A SUMMARY (Item 7):\n{mda}")
        if risk:
            lines.append(f"\nRISK FACTORS (Item 1A):\n{risk}")
        return "\n".join(lines)

    except Exception as exc:
        # Best-effort parser: any failure degrades to "no SEC context".
        logger.warning("SEC filing parse error: %s", exc)
        return ""


# ---------------------------------------------------------------------------
# Public @tool — used by analyst_node
# ---------------------------------------------------------------------------

@tool
def get_sec_filings(ticker: str) -> str:
    """Fetch the most recent 10-K or 10-Q filing from SEC EDGAR and extract
    MD&A and Risk Factors sections for investment analysis.

    Args:
        ticker: US stock ticker symbol (e.g. AAPL, MSFT)
    """
    # NOTE: the docstring above is left unchanged on purpose — ``@tool`` may
    # expose it as the tool description, so editing it could change what the
    # agent sees.
    #
    # Returns "" when the ticker is empty or contains ".", when no filing is
    # found, or when the document cannot be fetched or parsed. Returns a
    # short "could not retrieve document" notice when a filing was found but
    # no document URL could be resolved. Otherwise returns the parsed
    # sections followed by a "[Source: ...]" line.
    ticker = ticker.strip() if isinstance(ticker, str) else ""
    # An empty ticker would otherwise run an unfiltered EDGAR search; tickers
    # containing "." are rejected as before.
    if not ticker or "." in ticker:
        return ""

    filing = _search_filings(ticker)
    if not filing:
        logger.info("No SEC filings found for %s", ticker)
        return ""

    logger.info(
        "Found %s for %s (filed %s)",
        filing["form_type"], ticker, filing["file_date"],
    )

    doc_url = _fetch_filing_index(filing["entity_id"], filing["accession"])
    if not doc_url:
        return (
            f"SEC FILING GROUND TRUTH:\n"
            f"Found {filing['form_type']} filed {filing['file_date']} "
            f"but could not retrieve document."
        )

    # Brief pause before the next sec.gov request.
    time.sleep(0.5)

    try:
        resp = requests.get(
            doc_url,
            headers={**_SEC_HEADERS, "Accept": "text/html"},
            timeout=15,
        )
    except requests.RequestException as exc:
        logger.warning("SEC filing fetch failed for %s: %s", ticker, exc)
        return ""

    if resp.status_code != 200:
        logger.info("SEC filing fetch returned %d for %s", resp.status_code, ticker)
        return ""

    # ``resp.text`` decodes the body on each access, so read it only once.
    html = resp.text
    max_chars = 2_000_000
    if len(html) > max_chars:
        # len() of a str counts characters, not bytes.
        logger.info("SEC filing too large (%d chars), truncating", len(html))
        html = html[:max_chars]

    result = parse_mda_risk_factors(html)
    if not result:
        return ""

    header = f"[Source: {filing['form_type']} filed {filing['file_date']}]"
    return f"{result}\n{header}"