# robotic_seminars / src /models.py
# ar0s's picture
# added persistent storage
# 68fc3c9
from __future__ import annotations
import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Literal
import html
from urllib.parse import urljoin, urlparse
from pydantic import BaseModel, Field, HttpUrl
# Closed set of outcome labels shared by the result models in this module.
Status = Literal["ok", "no_upcoming", "error"]

# Browser-like User-Agent string, assembled from its individual product tokens.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/120",
        "Safari/537.36",
    )
)
# SYSTEM_PROMPT = (
# "You are given the page text content and a list of links from the page. "
# "Find up to the NEXT 3 upcoming talks/events AFTER now, sorted by time ascending. "
# "If the page lists only a date (no time), output start_time as an ISO-8601 DATE like '2026-01-15'. "
# "If events are listed as lines with a date (e.g. 'Jan 15, 2026 – ...'), treat each such line as an event. "
# "If there is no dedicated per-event URL, set event_url to the Source URL. "
# "If the schedule is not on this page, set next_url_to_check to the single best link to follow (one of LINKS). "
# "Do NOT invent placeholder titles like 'TBA' unless the page text explicitly contains 'TBA'. "
# "IMPORTANT: Keep fields clean and separated: "
# "- events[].title MUST be the talk/event title ONLY (no speaker name, no affiliation, no date/time). "
# "- Put the speaker/person name in events[].speaker when present in the text (e.g. 'Speaker: …', 'by …', 'Presenter: …'). "
# "- If the speaker affiliation/institution is present (e.g. 'UC Davis', 'MIT', 'Google DeepMind'), put it in events[].affiliation (do not mix it into title). "
# "- If a line contains both speaker and title (e.g. 'Jane Doe — Learning Robots' or 'Learning Robots — Jane Doe'), split them correctly. "
# "Choose the talk title that is best-supported by the page text. Give priority to explicit cues like 'Title:', 'Talk title:', 'Topic:', and text near 'Abstract:'/'Summary:'. "
# "If a header includes a series label plus a person name/affiliation (e.g. 'Seminar: Jane Doe (MIT)'), treat that as speaker/affiliation (not title) and keep searching the body for the real title. "
# "e.g. 'Robotics Institute Seminar: Mahdi Tavakoli (University of Alberta)' is not a title. "
# "Keep person name and affiliation separate. Each should be put in its own field. "
# "Never guess; quote evidence from the provided text. "
# "Never reply with any text other than the JSON object. If you don't find any events, still reply with a JSON object containing the no_upcoming status. "
# "REMEMBER: START TIME IS ALWAYS REQUIRED FOR EACH EVENT. IF THERE IS NO START TIME, DO NOT INCLUDE THE EVENT. "
# "Respond in JSON with the following schema: "
# "status: ok | no_upcoming | error"
# "events: array of up to 3 objects (required if ok)"
# "events[].title: string"
# "events[].start_time: ISO-8601 string; if only date is known, use YYYY-MM-DD (always required)"
# "events[].event_url: string URL"
# "events[].speaker: string "
# "events[].affiliation: string (if present)"
# "events[].evidence: short snippet from provided text (always required)"
# "error: short error string (required if error)"
# "next_url_to_check: string URL (optional - must be one of LINKS if provided)"
# )
# System prompt sent as the first ("system") message by llm_extract().
# It constrains the model to reply with exactly one JSON object whose keys
# (status, events, error, next_url_to_check) mirror the LlmHopResult model.
# The text below is runtime data: changing any character changes what the
# model is told.
SYSTEM_PROMPT = """
You are a JSON extraction engine. You do NOT write code.
CRITICAL OUTPUT CONSTRAINTS (HARD):
- Your entire reply MUST be valid JSON (RFC 8259).
- Reply with exactly ONE JSON object.
- The first non-whitespace character MUST be "{" and the last MUST be "}".
- Use double quotes for all JSON strings. Never use single quotes.
- Do NOT include markdown fences (```), explanations, pseudocode, or Python.
If you cannot follow these constraints, reply exactly:
{"status":"error","error":"non_json_or_invalid_schema"}
Event extraction rules (HARD):
- Return up to 3 upcoming events after "now", sorted by start_time ascending.
- Every event MUST include start_time and it MUST be ISO-8601 (e.g. "2026-01-15" or "2026-01-15T14:00+00:00").
- Do NOT use month names or informal dates like "Jan 15, 2026".
- If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event.
- If you find zero events with a date/time, return {"status":"no_upcoming"}.
Title rule (HARD):
- title MUST be copied verbatim from PAGE_TEXT (no paraphrasing).
- title MUST come from the same local event block as the date/time:
- it must appear within 300 characters of the date/time text you used for start_time.
- Do NOT use site/series/page headings or navigation as title.
Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text.
- If you cannot find a specific talk/topic title near the date/time, DO NOT include the event.
If PREVIOUS_EVENTS is provided:
- Use PREVIOUS_EVENTS as a strict copy source.
- If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS.
- Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided.
- If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them.
Final self-check (HARD, perform before replying):
- Your reply must be valid JSON only.
- For each event: verify start_time exists and is a non-empty string.
- If an event has missing/empty start_time, REMOVE that event.
- If no events remain, output {"status":"no_upcoming"}.
Schema:
{ "status": "ok"|"no_upcoming"|"error",
"events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "attending_count": 0}],
"error": "...",
"next_url_to_check": "..." }
"""
class OrgSource(BaseModel):
    """A configured organization page to check.

    Only ``name`` is read inside this module (it is forwarded to the LLM by
    ``llm_extract``); the remaining fields are consumed elsewhere.
    """

    # Identifier of the organization (presumably unique per source; confirm
    # against the code that loads these records).
    id: str
    # Human-readable organization name.
    name: str
    # Page URL; pydantic validates it as an HTTP(S) URL.
    url: HttpUrl
    # Free-form labels; a fresh empty list is created per instance.
    tags: list[str] = Field(default_factory=list)
class LlmEvent(BaseModel):
    """A single event object as returned by the LLM.

    The field set mirrors the ``events[]`` entries described in
    ``SYSTEM_PROMPT``. ``title`` and ``start_time`` are the only fields the
    prompt makes mandatory, so they are the only required fields here.
    """

    # Talk/event title (the prompt asks for it to be copied verbatim).
    title: str
    # ISO-8601 date or datetime kept as the raw string the model produced.
    start_time: str
    # Link for the event, when the model supplied one.
    event_url: str | None = None
    # Speaker name. Defaults to "" (matching the prompt's schema example) so
    # an event without a listed speaker does not fail validation of the whole
    # hop result.
    speaker: str = ""
    # Speaker affiliation, when present.
    affiliation: str | None = None
    # NOTE(review): presumably a count of attendees maintained outside this
    # module; nothing in this file writes it - confirm.
    attending_count: int = 0
class LlmHopResult(BaseModel):
    """Validated shape of one LLM reply, as returned by ``llm_extract``."""

    # One of "ok", "no_upcoming" or "error".
    status: Status
    # Extracted events; empty unless the model returned some.
    events: list[LlmEvent] = Field(default_factory=list)
    # Short error text supplied by the model, if any.
    error: str | None = None
    # Optional follow-up URL suggested by the model for the next page to check.
    next_url_to_check: str | None = None
class EventResult(BaseModel):
    """Per-organization outcome record.

    Nothing in this module constructs it; the field notes below describe the
    declared types only, with intent hedged where it is inferred from names.
    """

    # Organization identity (presumably copied from OrgSource.id / .name).
    org_id: str
    org_name: str
    # URL the check started from, stored as a plain string.
    source_url: str
    # One of "ok", "no_upcoming" or "error".
    status: Status
    # Events found for this organization; defaults to a fresh empty list.
    events: list[LlmEvent] = Field(default_factory=list)
    # Timestamp of the check as a string (format not established here).
    checked_at: str
    # Number of hops taken and the URLs visited - presumably while following
    # next_url_to_check suggestions; confirm against the caller.
    hops: int = 0
    visited_urls: list[str] = Field(default_factory=list)
    # Error description when the check failed, otherwise None.
    error: str | None = None
@dataclass
class LlmConfig:
model: str
api_key: str | None = None
api_base: str | None = None
def parse_dt_utc(value: str) -> datetime:
    """Parse an ISO-8601 string and return it as a UTC-aware datetime.

    A value without timezone information is taken to already be in UTC;
    a value with an offset is converted to UTC.
    """
    from dateutil.parser import isoparse

    parsed = isoparse(value)
    # Naive timestamps get UTC attached as-is rather than being shifted.
    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc)
def text_and_links(page_html: str, *, base_url: str, limit: int = 40) -> tuple[str, list[str]]:
    """Reduce an HTML page to visible text plus its same-host links.

    Args:
        page_html: Raw HTML of the page.
        base_url: URL the page was fetched from; used to resolve relative
            hrefs and to decide which links are on the same host.
        limit: Maximum number of links to return. Values <= 0 yield no links.

    Returns:
        A ``(text, links)`` tuple. ``text`` is the page text with
        script/style/noscript content removed, blank lines dropped, each line
        stripped, and the result truncated to 24000 characters. ``links`` is
        the ordered list of distinct absolute http(s) URLs whose host equals
        the host of ``base_url``, capped at ``limit`` entries.
    """
    from bs4 import BeautifulSoup

    # NOTE(review): entities are unescaped *before* parsing, so escaped markup
    # such as "&lt;b&gt;" is parsed as real tags. Presumably intentional for
    # double-escaped pages - confirm before changing.
    soup = BeautifulSoup(html.unescape(page_html), "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    stripped = (ln.strip() for ln in soup.get_text("\n").splitlines())
    text = "\n".join(ln for ln in stripped if ln)[:24000]

    links: list[str] = []
    if limit <= 0:
        # The cap used to be checked only after appending, so a non-positive
        # limit still leaked one link.
        return text, links

    base_dom = urlparse(base_url).netloc.lower()
    seen: set[str] = set()
    for a in soup.find_all("a", href=True):
        u = urljoin(base_url, str(a["href"]).strip())
        if u in seen:
            # Repeated hrefs (nav bars, footers) must not consume the budget.
            continue
        p = urlparse(u)
        if p.scheme in {"http", "https"} and p.netloc.lower() == base_dom:
            seen.add(u)
            links.append(u)
            if len(links) >= limit:
                break
    return text, links
def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult:
    """Ask the LLM to extract upcoming events from one fetched page.

    The page is reduced to text and same-host links via ``text_and_links``,
    sent together with ``SYSTEM_PROMPT`` and a final reminder message, and the
    reply is parsed and validated as an ``LlmHopResult``.

    Args:
        config: Model name plus optional API key / base URL for the call.
        org: Source being checked; only ``org.name`` is sent to the model.
        url: URL of the page; sent as ``source_url`` and used to resolve links.
        page_html: Raw HTML of the page.
        now_iso: Current time string, sent to the model as ``now``.
        previous_events: Previously stored events, sent as ``PREVIOUS_EVENTS``;
            must be JSON-serializable.

    Returns:
        The validated ``LlmHopResult``.

    Raises:
        ValueError: If the reply contains no parseable JSON object or the JSON
            does not match the ``LlmHopResult`` schema.

    The request timeout (seconds) is read from ``LLM_TIMEOUT_SECS``
    (default 60).
    """
    from litellm import completion  # type: ignore

    page_text, links = text_and_links(page_html, base_url=url)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events})},
        {"role": "user", "content": "LINKS:\n" + "\n".join(links)},
        {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"},
        {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). "
         "Must start with '{' and end with '}'. Use double quotes only. "
         "start_time MUST be ISO-8601 only (e.g. 2026-01-15 or 2026-01-15T14:00+00:00), never 'Jan 15, 2026'. "
         "Before returning, delete any event missing/empty start_time. "
         "Title must be copied verbatim from PAGE_TEXT near the date/time. "
         "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. "
         "If none remain, return {\"status\":\"no_upcoming\"}."},
    ]

    timeout = float(os.environ.get("LLM_TIMEOUT_SECS", "60"))
    kwargs: dict[str, object] = {"model": config.model, "temperature": 0, "timeout": timeout}
    if config.api_key:
        kwargs["api_key"] = config.api_key
    if config.api_base:
        kwargs["api_base"] = config.api_base

    content = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"]
    if not isinstance(content, str):
        # An empty/None message would otherwise crash with AttributeError.
        raise ValueError(f"LLM did not return valid JSON:\n{content}")

    # Keep only the outermost {...}. This already drops markdown fences and a
    # leading "json" tag; stripping those substrings globally would corrupt
    # values inside the payload (URLs, titles, error codes).
    raw = content[content.find("{") : content.rfind("}") + 1]
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"LLM did not return valid JSON:\n{raw}") from e

    try:
        return LlmHopResult.model_validate(payload)
    except Exception as e:
        raise ValueError(f"LLM returned JSON that does not match schema: {e}\n{payload}") from e