Spaces:

hchevva
/

TOXRA.AI

Sleeping

App Files Files Community

TOXRA.AI / core /sources /ntp.py

hchevva

Upload 4 files

02835d5 verified 25 days ago

raw

history blame contribute delete

8.64 kB

	import html
	import re
	from typing import Any, Dict, List, Optional
	from urllib.parse import urljoin

	import httpx

	from core.config import settings

	# NTP reports index page; scanned first for TR-### markers.
	REPORTS_URL = "https://ntp.niehs.nih.gov/publications/reports"
	# Site root, used to build individual report-page URLs.
	BASE = "https://ntp.niehs.nih.gov"
	# Technical Report data index; used as a fallback when REPORTS_URL yields nothing.
	INDEX_URL = "https://ntp.niehs.nih.gov/data/tr"

	# "TR-123", "TR123", "tr-1234" as a whole word; group 1 is the number.
	TR_RE = re.compile(r"\bTR-?(\d{3,4})\b", re.IGNORECASE)
	# Value of any single- or double-quoted href attribute.
	HREF_RE = re.compile(r"href=[\"']([^\"']+)[\"']", re.IGNORECASE)
	# Contents of the <title> element; DOTALL because titles may span lines.
	TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
	# A TR identifier that is the entire text of an element, e.g. ">TR-596<".
	TR_MARKER_RE = re.compile(r">(TR-\d{3,})<", re.IGNORECASE)


	def _strip_tags(html_text: str) -> str:
	# crude but robust enough for the NTP index page
	text = re.sub(r"<script[\s\S]*?</script>", " ", html_text, flags=re.IGNORECASE)
	text = re.sub(r"<style[\s\S]*?</style>", " ", text, flags=re.IGNORECASE)
	text = re.sub(r"<[^>]+>", " ", text)
	text = html.unescape(text)
	text = re.sub(r"\s+", " ", text).strip()
	return text


	def _extract_title(page_html: str) -> str:
	    """Return the cleaned ``<title>`` text of an HTML page.

	    Entities are decoded, whitespace is collapsed, and a trailing
	    "| NTP ..." site suffix is removed.

	    Args:
	        page_html: Full page markup; ``None`` or empty is accepted.

	    Returns:
	        The title text, or ``""`` when the page has no ``<title>`` element.
	    """
	    m = TITLE_RE.search(page_html or "")
	    if not m:
	        return ""
	    t = html.unescape(m.group(1))
	    t = re.sub(r"\s+", " ", t).strip()
	    # Common suffix: drop everything from a "| NTP" separator to the end.
	    t = re.sub(r"\s*\|\s*NTP.*$", "", t, flags=re.IGNORECASE).strip()
	    return t


	def _extract_pdf_url(page_html: str, page_url: str) -> Optional[str]:
	    """Return the absolute URL of the first PDF link found in a page.

	    A link counts as a PDF link when its href contains ".pdf"
	    (case-insensitive) anywhere, so hrefs with a query string or fragment
	    after the extension are accepted. In-page anchors (hrefs starting with
	    "#") are ignored.

	    Args:
	        page_html: Page markup to scan; ``None`` or empty is accepted.
	        page_url: URL the page was loaded from, used to resolve relative hrefs.

	    Returns:
	        The resolved PDF URL, or ``None`` when no matching link exists.
	    """
	    for href in HREF_RE.findall(page_html or ""):
	        if href.startswith("#") or ".pdf" not in href.lower():
	            continue
	        return urljoin(page_url, href)
	    return None


	async def _fetch_tr_page(num: str, http: httpx.AsyncClient) -> Optional[Dict[str, Any]]:
	    """Fetch one Technical Report page and extract its basic metadata.

	    Args:
	        num: Report number without the "TR-" prefix; it is interpolated into
	            ``{BASE}/publications/reports/tr{num}``.
	        http: Client used to issue the GET request.

	    Returns:
	        A dict with keys ``num``, ``tr``, ``report_page`` (final URL after
	        redirects), ``title``, ``year`` (last four-digit 19xx/20xx year in
	        the title, or ``None``) and ``pdf`` (first PDF link, or ``None``).
	        ``None`` is returned when the request fails or the status is >= 400.
	    """
	    page_url = f"{BASE}/publications/reports/tr{num}"
	    try:
	        r = await http.get(
	            page_url,
	            timeout=25,
	            follow_redirects=True,
	            headers={"User-Agent": "Mozilla/5.0"},
	        )
	        if r.status_code >= 400:
	            return None
	        page_html = r.text
	    except Exception:
	        # Deliberately best-effort: any fetch failure means "no data for this TR".
	        return None

	    # Resolve links against the final URL, which may differ after redirects.
	    final_url = str(r.url)
	    title = _extract_title(page_html)
	    pdf_url = _extract_pdf_url(page_html, final_url)

	    # Try to find a year in the title; the last one wins.
	    year = None
	    if title:
	        years = re.findall(r"\b(19\d{2}|20\d{2})\b", title)
	        if years:
	            year = years[-1]

	    return {
	        "num": num,
	        "tr": f"TR-{num}",
	        "report_page": final_url,
	        "title": title,
	        "year": year,
	        "pdf": pdf_url,
	    }


	async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: int = 8) -> Dict[str, Any]:
	    """Search NTP Technical Reports and return only TR hits relevant to the query.

	    Strategy, in order:
	      1. If ``settings.worker_base_url`` is set, POST the query to its
	         ``/ntp-tr`` endpoint and return those rows when there are any.
	      2. Otherwise (or if the worker fails or returns nothing) download
	         ``REPORTS_URL``, locate ``TR-###`` markers and keep a TR when the
	         query appears in the text of the following lines.
	      3. If that yields nothing, scan ``INDEX_URL`` the same way.

	    Individual TR pages are not fetched here; links and PDFs come from the
	    index markup around each marker.

	    Args:
	        query: Free-text search term or a CAS number such as "50-00-0".
	        http: Client used for all requests.
	        limit: Maximum number of scraped items; also forwarded to the worker.

	    Returns:
	        ``{"ok": True, "query": ..., "items": [...]}`` on success, or
	        ``{"ok": False, "error": ..., "items": []}`` for an empty query or
	        when the reports index cannot be downloaded. Each item has ``tr``,
	        ``title``, ``year``, ``pdf`` and ``report_page``; items that come
	        from the worker additionally carry ``num``.
	    """
	    q = (query or "").strip()
	    if not q:
	        return {"ok": False, "error": "Empty query", "items": []}

	    # Prefer worker proxy if configured (matches production behavior).
	    if settings.worker_base_url:
	        worker_items = await _search_via_worker(q, http, limit)
	        if worker_items:
	            return {"ok": True, "query": q, "items": worker_items}

	    try:
	        r = await http.get(
	            REPORTS_URL,
	            timeout=25,
	            follow_redirects=True,
	            headers={"User-Agent": "Mozilla/5.0"},
	        )
	        r.raise_for_status()
	        index_html = r.text
	    except Exception as e:
	        return {"ok": False, "error": f"Failed to fetch NTP index: {e}", "items": []}

	    results = _scan_reports_index(index_html, q, limit)

	    # Fallback: scan the TR index page (data/tr).
	    if not results:
	        try:
	            r2 = await http.get(
	                INDEX_URL,
	                timeout=25,
	                follow_redirects=True,
	                headers={"User-Agent": "Mozilla/5.0"},
	            )
	            idx_html = r2.text if r2.status_code < 400 else ""
	        except Exception:
	            # Best-effort fallback: a failure here just means no extra results.
	            idx_html = ""

	        if idx_html:
	            results = _scan_tr_index(idx_html, q, limit)

	    return {"ok": True, "query": q, "items": results}


	async def _search_via_worker(q: str, http: httpx.AsyncClient, limit: int) -> List[Dict[str, Any]]:
	    """Query the worker proxy; return its rows as items, or [] on any failure."""
	    try:
	        worker_url = settings.worker_base_url.rstrip("/") + "/ntp-tr"
	        r = await http.post(worker_url, json={"query": q, "limit": limit}, timeout=25.0)
	        if r.status_code >= 400:
	            return []
	        data = r.json()
	        rows = data.get("results") or []
	        items: List[Dict[str, Any]] = []
	        for row in rows:
	            if not isinstance(row, dict):
	                continue
	            items.append(
	                {
	                    "num": (row.get("tr") or "").replace("TR-", ""),
	                    "tr": row.get("tr"),
	                    "report_page": row.get("page") or row.get("url"),
	                    "title": row.get("title") or "NTP Technical Report",
	                    "year": row.get("year"),
	                    "pdf": row.get("pdf"),
	                }
	            )
	        return items
	    except Exception:
	        # Deliberately swallowed: the caller falls back to scraping the site directly.
	        return []


	def _score_report_link(url: str, tr_id: str) -> int:
	    """Rank a candidate report-page URL; a higher score is a better match."""
	    s = 0
	    if "/publications/" in url:
	        s += 3
	    if "/go/" in url:
	        s += 3
	    if tr_id and tr_id.lower() in url.lower():
	        s += 2
	    if re.search(r"/reports?", url):
	        s += 1
	    return s


	def _scan_reports_index(index_html: str, q: str, limit: int) -> List[Dict[str, Any]]:
	    """Collect up to ``limit`` TR entries from the reports index that match ``q``."""
	    lines = index_html.splitlines()
	    q_low = q.lower()
	    is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
	    # CAS numbers are compared digits-only so markup between the parts cannot hide them.
	    q_digits = re.sub(r"\D", "", q) if is_cas else ""
	    results: List[Dict[str, Any]] = []

	    for i, line in enumerate(lines):
	        m = TR_MARKER_RE.search(line)
	        if not m:
	            continue

	        # Neighborhood of the marker: 12 lines for text search, 3 for CAS lookup.
	        snippet = " ".join(lines[i : i + 12])
	        mini_text = _strip_tags(" ".join(lines[i : i + 3]))
	        text_block = _strip_tags(snippet)

	        if is_cas:
	            if q_digits not in re.sub(r"\D", "", mini_text):
	                continue
	            if q_digits not in re.sub(r"\D", "", text_block):
	                continue
	        elif q_low not in text_block.lower():
	            continue

	        tr_id = m.group(1)

	        # Prefer a non-PDF link under /publications/ or /go/.
	        candidates = [
	            urljoin(REPORTS_URL, h)
	            for h in HREF_RE.findall(snippet)
	            if not re.search(r"\.pdf(\?|$)", h, re.I)
	        ]
	        # max() keeps the first of equally scored links, like a stable descending sort.
	        page_href = max(
	            candidates,
	            key=lambda u: _score_report_link(u, tr_id),
	            default=None,
	        )

	        pdf_match = re.search(r'href="([^"]+\.pdf)"', snippet, re.I)
	        pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None

	        title_match = re.search(
	            r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
	            text_block,
	            re.I,
	        )
	        year_match = re.search(r"\b(20\d{2}|19\d{2})\b", text_block)

	        results.append(
	            {
	                "tr": tr_id,
	                "title": title_match.group(1) if title_match else "",
	                "year": year_match.group(1) if year_match else "",
	                "pdf": pdf_url,
	                "report_page": page_href or REPORTS_URL,
	            }
	        )
	        if len(results) >= int(limit):
	            break

	    return results


	def _scan_tr_index(idx_html: str, q: str, limit: int) -> List[Dict[str, Any]]:
	    """Collect up to ``limit`` entries from the TR data index whose text contains ``q``."""
	    idx_lines = idx_html.splitlines()
	    q_low = q.lower()
	    results: List[Dict[str, Any]] = []

	    for i, row in enumerate(idx_lines):
	        if not re.search(r"TR-\d{3,}", row, re.I):
	            continue
	        block_text = _strip_tags(" ".join(idx_lines[i : i + 6]))
	        if q_low not in block_text.lower():
	            continue

	        tr = re.search(r"TR-\d{3,}", block_text, re.I)
	        # The title is whatever remains once TR ids and CAS numbers are removed.
	        name = re.sub(r"TR-\d{3,}", "", block_text, flags=re.I)
	        name = re.sub(r"\b\d{2,7}-\d{2}-\d\b", "", name).strip()

	        results.append(
	            {
	                "tr": tr.group(0) if tr else "",
	                "title": name or "",
	                "year": "",
	                "pdf": None,
	                "report_page": INDEX_URL,
	            }
	        )
	        if len(results) >= int(limit):
	            break

	    return results