Spaces:

ar0s
/

robotic_seminars

Sleeping

App Files Files Community

robotic_seminars / src /models.py

ar0s

added persistent storage

68fc3c9 24 days ago

raw

history blame contribute delete

10.2 kB

	from __future__ import annotations

	import json
	import os
	from dataclasses import dataclass
	from datetime import datetime, timezone
	from typing import Literal

	import html
	from urllib.parse import urljoin, urlparse

	from pydantic import BaseModel, Field, HttpUrl

	# Closed set of outcome labels used by the result models in this module.
	Status = Literal["ok", "no_upcoming", "error"]

	# Desktop-Chrome-style User-Agent string, assembled from its two halves.
	USER_AGENT = " ".join(
	    [
	        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
	        "(KHTML, like Gecko) Chrome/120 Safari/537.36",
	    ]
	)

	# SYSTEM_PROMPT = (
	# "You are given the page text content and a list of links from the page. "
	# "Find up to the NEXT 3 upcoming talks/events AFTER now, sorted by time ascending. "
	# "If the page lists only a date (no time), output start_time as an ISO-8601 DATE like '2026-01-15'. "
	# "If events are listed as lines with a date (e.g. 'Jan 15, 2026 – ...'), treat each such line as an event. "
	# "If there is no dedicated per-event URL, set event_url to the Source URL. "
	# "If the schedule is not on this page, set next_url_to_check to the single best link to follow (one of LINKS). "
	# "Do NOT invent placeholder titles like 'TBA' unless the page text explicitly contains 'TBA'. "
	# "IMPORTANT: Keep fields clean and separated: "
	# "- events[].title MUST be the talk/event title ONLY (no speaker name, no affiliation, no date/time). "
	# "- Put the speaker/person name in events[].speaker when present in the text (e.g. 'Speaker: …', 'by …', 'Presenter: …'). "
	# "- If the speaker affiliation/institution is present (e.g. 'UC Davis', 'MIT', 'Google DeepMind'), put it in events[].affiliation (do not mix it into title). "
	# "- If a line contains both speaker and title (e.g. 'Jane Doe — Learning Robots' or 'Learning Robots — Jane Doe'), split them correctly. "
	# "Choose the talk title that is best-supported by the page text. Give priority to explicit cues like 'Title:', 'Talk title:', 'Topic:', and text near 'Abstract:'/'Summary:'. "
	# "If a header includes a series label plus a person name/affiliation (e.g. 'Seminar: Jane Doe (MIT)'), treat that as speaker/affiliation (not title) and keep searching the body for the real title. "
	# "e.g. this Robotics Institute Seminar: Mahdi Tavakoli (University of Alberta) is not a title"
	# "Keep person name and affiliation separate. Each should be put in its own field. "
	# "Never guess; quote evidence from the provided text."
	# "Never reply with any text other than the JSON object. If you don't find any events, still reply with a JSON object containing the no_upcoming status. "
	# "REMEMBER: START TIME IS ALWAYS REQUIRED FOR EACH EVENT. IF THERE IS NO START TIME, DO NOT INCLUDE THE EVENT. "

	# "Respond in JSON with the following schema: "

	# "status: ok | no_upcoming | error"
	# "events: array of up to 3 objects (required if ok)"
	# "events[].title: string"
	# "events[].start_time: ISO-8601 string; if only date is known, use YYYY-MM-DD (always required)"
	# "events[].event_url: string URL"
	# "events[].speaker: string "
	# "events[].affiliation: string (if present)"
	# "events[].evidence: short snippet from provided text (always required)"
	# "error: short error string (required if error)"
	# "next_url_to_check: string URL (optional - must be one of LINKS if provided)"
	# )

	# System message sent as the first chat message by llm_extract. It pins the
	# reply to a single JSON object and spells out the extraction rules.
	# The status alternatives in the schema use a plain "|": a backslash before it
	# would be an invalid escape sequence in this non-raw string literal.
	SYSTEM_PROMPT = """
	You are a JSON extraction engine. You do NOT write code.

	CRITICAL OUTPUT CONSTRAINTS (HARD):
	- Your entire reply MUST be valid JSON (RFC 8259).
	- Reply with exactly ONE JSON object.
	- The first non-whitespace character MUST be "{" and the last MUST be "}".
	- Use double quotes for all JSON strings. Never use single quotes.
	- Do NOT include markdown fences (```), explanations, pseudocode, or Python.

	If you cannot follow these constraints, reply exactly:
	{"status":"error","error":"non_json_or_invalid_schema"}

	Event extraction rules (HARD):
	- Return up to 3 upcoming events after "now", sorted by start_time ascending.
	- Every event MUST include start_time and it MUST be ISO-8601 (e.g. "2026-01-15" or "2026-01-15T14:00+00:00").
	- Do NOT use month names or informal dates like "Jan 15, 2026".
	- If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event.
	- If you find zero events with a date/time, return {"status":"no_upcoming"}.

	Title rule (HARD):
	- title MUST be copied verbatim from PAGE_TEXT (no paraphrasing).
	- title MUST come from the same local event block as the date/time:
	- it must appear within 300 characters of the date/time text you used for start_time.
	- Do NOT use site/series/page headings or navigation as title.
	Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text.
	- If you cannot find a specific talk/topic title near the date/time, DO NOT include the event.

	If PREVIOUS_EVENTS is provided:
	- Use PREVIOUS_EVENTS as a strict copy source.
	- If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS.
	- Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided.
	- If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them.

	Final self-check (HARD, perform before replying):
	- Your reply must be valid JSON only.
	- For each event: verify start_time exists and is a non-empty string.
	- If an event has missing/empty start_time, REMOVE that event.
	- If no events remain, output {"status":"no_upcoming"}.

	Schema:
	{ "status": "ok"|"no_upcoming"|"error",
	"events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "attending_count": 0}],
	"error": "...",
	"next_url_to_check": "..." }

	"""


	class OrgSource(BaseModel):
	    """One organisation source: an id, a display name, a URL and optional tags."""

	    id: str
	    name: str
	    # Typed as HttpUrl, so pydantic validates it when the model is built.
	    url: HttpUrl
	    # default_factory gives every instance its own empty list.
	    tags: list[str] = Field(default_factory=list)


	class LlmEvent(BaseModel):
	    """A single event object as it appears in the LLM's JSON reply.

	    The field names mirror the "events" entries of the schema in SYSTEM_PROMPT.
	    ``title`` and ``start_time`` are mandatory; everything else has a default so
	    that a reply which leaves out an optional key still validates.
	    """

	    title: str
	    # Kept as the raw string from the reply; no date parsing happens here.
	    start_time: str
	    event_url: str | None = None
	    # Defaults to "" (the placeholder shown in the prompt schema) so an event
	    # without a "speaker" key does not invalidate the whole reply.
	    speaker: str = ""
	    affiliation: str | None = None
	    attending_count: int = 0


	class LlmHopResult(BaseModel):
	    """Validated form of one LLM reply, as returned by llm_extract.

	    Only ``status`` is required; the remaining keys fall back to an empty event
	    list or ``None`` when the reply omits them.
	    """

	    status: Status
	    events: list[LlmEvent] = Field(default_factory=list)
	    # Short error text supplied by the model, if any.
	    error: str | None = None
	    # Optional follow-up URL suggested by the model.
	    next_url_to_check: str | None = None


	class EventResult(BaseModel):
	    """Per-organisation result record: source identity, status and events."""

	    org_id: str
	    org_name: str
	    source_url: str
	    status: Status
	    events: list[LlmEvent] = Field(default_factory=list)
	    # Stored as a plain string; presumably a timestamp of the check (confirm
	    # the exact format against the code that fills it in).
	    checked_at: str
	    # NOTE(review): looks like the number of pages followed and the URLs
	    # fetched for this source - confirm against the caller.
	    hops: int = 0
	    visited_urls: list[str] = Field(default_factory=list)
	    error: str | None = None


	@dataclass
	class LlmConfig:
	model: str
	api_key: str \| None = None
	api_base: str \| None = None

	def parse_dt_utc(value: str) -> datetime:
	    """Parse an ISO-8601 string into a timezone-aware datetime expressed in UTC.

	    A value that carries no UTC offset is treated as already being in UTC;
	    a value with an offset is converted to UTC.
	    """
	    from dateutil.parser import isoparse

	    parsed = isoparse(value)
	    # Naive results are labelled as UTC rather than shifted from local time.
	    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
	    return aware.astimezone(timezone.utc)


	def text_and_links(page_html: str, *, base_url: str, limit: int = 40) -> tuple[str, list[str]]:
	    """Reduce an HTML page to plain text plus a short list of same-host links.

	    Args:
	        page_html: Raw HTML of the page.
	        base_url: URL the page was loaded from; relative hrefs are resolved
	            against it and only links on the same host (netloc) are kept.
	        limit: Maximum number of links to return. Zero or a negative value
	            yields an empty list.

	    Returns:
	        ``(text, links)``. ``text`` is the page text with ``<script>``,
	        ``<style>`` and ``<noscript>`` removed, every line stripped, empty
	        lines dropped, and the result cut to 24000 characters. ``links`` holds
	        absolute http(s) URLs in document order, without duplicates.
	    """
	    from bs4 import BeautifulSoup

	    # NOTE(review): entities are unescaped *before* parsing, so escaped markup
	    # such as "&lt;b&gt;" is parsed as real tags. Kept as-is because callers may
	    # rely on it for double-escaped pages - confirm whether this is intended.
	    soup = BeautifulSoup(html.unescape(page_html), "html.parser")
	    for tag in soup(["script", "style", "noscript"]):
	        tag.decompose()
	    text = "\n".join(ln.strip() for ln in soup.get_text("\n").splitlines() if ln.strip())[:24000]

	    base_dom = urlparse(base_url).netloc.lower()
	    links: list[str] = []
	    seen: set[str] = set()
	    for a in soup.find_all("a", href=True):
	        # Check the budget before appending so limit <= 0 returns no links.
	        if len(links) >= limit:
	            break
	        u = urljoin(base_url, str(a["href"]).strip())
	        if u in seen:
	            # Repeated hrefs (menus, footers) must not use up the link budget.
	            continue
	        p = urlparse(u)
	        if p.scheme in {"http", "https"} and p.netloc.lower() == base_dom:
	            seen.add(u)
	            links.append(u)

	    return text, links


	def _json_object_text(reply: str | None) -> str:
	    """Return the part of *reply* from its first "{" to its last "}", or "" if there is none."""
	    text = reply or ""
	    start = text.find("{")
	    end = text.rfind("}")
	    if start == -1 or end < start:
	        return ""
	    return text[start : end + 1]


	def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult:
	    """Ask the LLM to extract upcoming events from one page and validate the reply.

	    The page is reduced to text and links with ``text_and_links``. That content
	    is sent together with ``SYSTEM_PROMPT`` and the request context, and the
	    reply is parsed into an ``LlmHopResult``.

	    Args:
	        config: Model name plus optional API key / base URL for the call.
	        org: Source being checked; only ``org.name`` is sent to the model.
	        url: URL of the page; used as ``source_url`` and as the base for links.
	        page_html: Raw HTML of the page.
	        now_iso: Current time as a string, passed to the model as ``now``.
	        previous_events: Earlier event dicts, passed as ``PREVIOUS_EVENTS``.

	    Returns:
	        The validated ``LlmHopResult``.

	    Raises:
	        ValueError: If the reply contains no parseable JSON object, or the JSON
	            does not match the ``LlmHopResult`` schema. The underlying error is
	            attached as ``__cause__``.
	    """
	    from litellm import completion  # type: ignore

	    page_text, links = text_and_links(page_html, base_url=url)
	    messages = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        {"role": "user", "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events})},
	        {"role": "user", "content": "LINKS:\n" + "\n".join(links)},
	        {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"},
	        {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). "
	        "Must start with '{' and end with '}'. Use double quotes only. "
	        "start_time MUST be ISO-8601 only (e.g. 2026-01-15 or 2026-01-15T14:00+00:00), never 'Jan 15, 2026'. "
	        "Before returning, delete any event missing/empty start_time. "
	        "Title must be copied verbatim from PAGE_TEXT near the date/time. "
	        "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. "
	        "If none remain, return {\"status\":\"no_upcoming\"}."},
	    ]

	    timeout = float(os.environ.get("LLM_TIMEOUT_SECS", "60"))
	    kwargs: dict[str, object] = {"model": config.model, "temperature": 0, "timeout": timeout}
	    if config.api_key:
	        kwargs["api_key"] = config.api_key
	    if config.api_base:
	        kwargs["api_base"] = config.api_base

	    raw = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"]
	    # Cutting from the first "{" to the last "}" already drops markdown fences
	    # and a leading "json" tag. A global replace of backticks or "json" would
	    # also corrupt titles, URLs and error codes inside the payload.
	    candidate = _json_object_text(raw)

	    try:
	        payload = json.loads(candidate)
	    except json.JSONDecodeError as exc:
	        raise ValueError(f"LLM did not return valid JSON:\n{raw}") from exc

	    try:
	        return LlmHopResult.model_validate(payload)
	    except Exception as exc:
	        # Broad on purpose: callers only need to handle ValueError from here.
	        raise ValueError(f"LLM returned JSON that does not match schema: {exc}\n{payload}") from exc