from __future__ import annotations import json import os from dataclasses import dataclass from datetime import datetime, timezone from typing import Literal import html from urllib.parse import urljoin, urlparse from pydantic import BaseModel, Field, HttpUrl Status = Literal["ok", "no_upcoming", "error"] USER_AGENT = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120 Safari/537.36" ) # SYSTEM_PROMPT = ( # "You are given the page text content and a list of links from the page. " # "Find up to the NEXT 3 upcoming talks/events AFTER now, sorted by time ascending. " # "If the page lists only a date (no time), output start_time as an ISO-8601 DATE like '2026-01-15'. " # "If events are listed as lines with a date (e.g. 'Jan 15, 2026 – ...'), treat each such line as an event. " # "If there is no dedicated per-event URL, set event_url to the Source URL. " # "If the schedule is not on this page, set next_url_to_check to the single best link to follow (one of LINKS). " # "Do NOT invent placeholder titles like 'TBA' unless the page text explicitly contains 'TBA'. " # "IMPORTANT: Keep fields clean and separated: " # "- events[].title MUST be the talk/event title ONLY (no speaker name, no affiliation, no date/time). " # "- Put the speaker/person name in events[].speaker when present in the text (e.g. 'Speaker: …', 'by …', 'Presenter: …'). " # "- If the speaker affiliation/institution is present (e.g. 'UC Davis', 'MIT', 'Google DeepMind'), put it in events[].affiliation (do not mix it into title). " # "- If a line contains both speaker and title (e.g. 'Jane Doe — Learning Robots' or 'Learning Robots — Jane Doe'), split them correctly. " # "Choose the talk title that is best-supported by the page text. Give priority to explicit cues like 'Title:', 'Talk title:', 'Topic:', and text near 'Abstract:'/'Summary:'. " # "If a header includes a series label plus a person name/affiliation (e.g. 'Seminar: Jane Doe (MIT)'), treat that as speaker/affiliation (not title) and keep searching the body for the real title. " # "e.g. this Robotics Institute Seminar: Mahdi Tavakoli (University of Alberta) is not a title" # "Keep person name and affiliation seprate. Each should be put in its own field. " # "Never guess; quote evidence from the provided text." # "Never reply with any text other than the JSON object. If you don't find any events, still reply with a JSON object containing the no_upcoming status. " # "REMEMBER: START TIME IS ALWAYS REQUIRED FOR EACH EVENT. IF THERE IS NO START TIME, DO NOT INCLUDE THE EVENT. " # "Respond in JSON with the following schema: " # "status: ok | no_upcoming | error" # "events: array of up to 3 objects (required if ok)" # "events[].title: string" # "events[].start_time: ISO-8601 string; if only date is known, use YYYY-MM-DD (always required)" # "events[].event_url: string URL" # "events[].speaker: string " # "events[].affiliation: string (if prersent)" # "events[].evidence: short snippet from provided text (always required)" # "error: short error string (required if error)" # "next_url_to_check: string URL (optional - must be one of LINKS if provided)" # ) SYSTEM_PROMPT = """ You are a JSON extraction engine. You do NOT write code. CRITICAL OUTPUT CONSTRAINTS (HARD): - Your entire reply MUST be valid JSON (RFC 8259). - Reply with exactly ONE JSON object. - The first non-whitespace character MUST be "{" and the last MUST be "}". - Use double quotes for all JSON strings. Never use single quotes. - Do NOT include markdown fences (```), explanations, pseudocode, or Python. If you cannot follow these constraints, reply exactly: {"status":"error","error":"non_json_or_invalid_schema"} Event extraction rules (HARD): - Return up to 3 upcoming events after "now", sorted by start_time ascending. - Every event MUST include start_time and it MUST be ISO-8601 (e.g. "2026-01-15" or "2026-01-15T14:00+00:00"). - Do NOT use month names or informal dates like "Jan 15, 2026". - If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event. - If you find zero events with a date/time, return {"status":"no_upcoming"}. Title rule (HARD): - title MUST be copied verbatim from PAGE_TEXT (no paraphrasing). - title MUST come from the same local event block as the date/time: - it must appear within 300 characters of the date/time text you used for start_time. - Do NOT use site/series/page headings or navigation as title. Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text. - If you cannot find a specific talk/topic title near the date/time, DO NOT include the event. If PREVIOUS_EVENTS is provided: - Use PREVIOUS_EVENTS as a strict copy source. - If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS. - Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided. - If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them. Final self-check (HARD, perform before replying): - Your reply must be valid JSON only. - For each event: verify start_time exists and is a non-empty string. - If an event has missing/empty start_time, REMOVE that event. - If no events remain, output {"status":"no_upcoming"}. Schema: { "status": "ok"|"no_upcoming"|"error", "events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "attending_count": 0}], "error": "...", "next_url_to_check": "..." } """ class OrgSource(BaseModel): id: str name: str url: HttpUrl tags: list[str] = Field(default_factory=list) class LlmEvent(BaseModel): title: str start_time: str event_url: str | None = None speaker: str affiliation: str | None = None attending_count: int = 0 class LlmHopResult(BaseModel): status: Status events: list[LlmEvent] = Field(default_factory=list) error: str | None = None next_url_to_check: str | None = None class EventResult(BaseModel): org_id: str org_name: str source_url: str status: Status events: list[LlmEvent] = Field(default_factory=list) checked_at: str hops: int = 0 visited_urls: list[str] = Field(default_factory=list) error: str | None = None @dataclass class LlmConfig: model: str api_key: str | None = None api_base: str | None = None def parse_dt_utc(value: str) -> datetime: from dateutil import parser as dtparser dt = dtparser.isoparse(value) if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) return dt.astimezone(timezone.utc) def text_and_links(page_html: str, *, base_url: str, limit: int = 40) -> tuple[str, list[str]]: from bs4 import BeautifulSoup soup = BeautifulSoup(html.unescape(page_html), "html.parser") for tag in soup(["script", "style", "noscript"]): tag.decompose() text = "\n".join(ln.strip() for ln in soup.get_text("\n").splitlines() if ln.strip())[:24000] base_dom = urlparse(base_url).netloc.lower() links: list[str] = [] for a in soup.find_all("a", href=True): u = urljoin(base_url, str(a["href"]).strip()) p = urlparse(u) if p.scheme in {"http", "https"} and p.netloc.lower() == base_dom: links.append(u) if len(links) >= limit: break return text, links def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult: from litellm import completion # type: ignore page_text, links = text_and_links(page_html, base_url=url) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events})}, {"role": "user", "content": "LINKS:\n" + "\n".join(links)}, {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"}, {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). " "Must start with '{' and end with '}'. Use double quotes only. " "start_time MUST be ISO-8601 only (e.g. 2026-01-15 or 2026-01-15T14:00+00:00), never 'Jan 15, 2026'. " "Before returning, delete any event missing/empty start_time. " "Title must be copied verbatim from PAGE_TEXT near the date/time. " "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. " "If none remain, return {\"status\":\"no_upcoming\"}."} ] timeout = float(os.environ.get("LLM_TIMEOUT_SECS", "60")) kwargs: dict[str, object] = {"model": config.model, "temperature": 0, "timeout": timeout} if config.api_key: kwargs["api_key"] = config.api_key if config.api_base: kwargs["api_base"] = config.api_base content = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"] content = content.replace('`', '').replace('json', '') content = content[content.find("{") : content.rfind("}") + 1] try: content = json.loads(content) except json.JSONDecodeError: raise ValueError(f"LLM did not return valid JSON:\n{content}") try: LlmHopResult.model_validate(content) except Exception as e: raise ValueError(f"LLM returned JSON that does not match schema: {e}\n{content}") return LlmHopResult.model_validate(content)