import html import re from typing import Any, Dict, List, Optional from urllib.parse import urljoin import httpx from core.config import settings REPORTS_URL = "https://ntp.niehs.nih.gov/publications/reports" BASE = "https://ntp.niehs.nih.gov" INDEX_URL = "https://ntp.niehs.nih.gov/data/tr" TR_RE = re.compile(r"\bTR-?(\d{3,4})\b", re.IGNORECASE) HREF_RE = re.compile(r"href=[\"']([^\"']+)[\"']", re.IGNORECASE) TITLE_RE = re.compile(r"