# (page-scrape artifact removed: "Spaces: Sleeping" hosting-status banner, not part of the module)
| from __future__ import annotations | |
| import json | |
| import os | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from typing import Literal | |
| import html | |
| from urllib.parse import urljoin, urlparse | |
| from pydantic import BaseModel, Field, HttpUrl | |
# Outcome of a single extraction attempt for one source page.
Status = Literal["ok", "no_upcoming", "error"]

# Desktop-Chrome UA string; presumably used by the page fetcher elsewhere in
# this module to avoid trivial bot-blocking -- not referenced in this chunk.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120 Safari/537.36"
)
| # SYSTEM_PROMPT = ( | |
| # "You are given the page text content and a list of links from the page. " | |
| # "Find up to the NEXT 3 upcoming talks/events AFTER now, sorted by time ascending. " | |
| # "If the page lists only a date (no time), output start_time as an ISO-8601 DATE like '2026-01-15'. " | |
| # "If events are listed as lines with a date (e.g. 'Jan 15, 2026 – ...'), treat each such line as an event. " | |
| # "If there is no dedicated per-event URL, set event_url to the Source URL. " | |
| # "If the schedule is not on this page, set next_url_to_check to the single best link to follow (one of LINKS). " | |
| # "Do NOT invent placeholder titles like 'TBA' unless the page text explicitly contains 'TBA'. " | |
| # "IMPORTANT: Keep fields clean and separated: " | |
| # "- events[].title MUST be the talk/event title ONLY (no speaker name, no affiliation, no date/time). " | |
| # "- Put the speaker/person name in events[].speaker when present in the text (e.g. 'Speaker: …', 'by …', 'Presenter: …'). " | |
| # "- If the speaker affiliation/institution is present (e.g. 'UC Davis', 'MIT', 'Google DeepMind'), put it in events[].affiliation (do not mix it into title). " | |
| # "- If a line contains both speaker and title (e.g. 'Jane Doe — Learning Robots' or 'Learning Robots — Jane Doe'), split them correctly. " | |
| # "Choose the talk title that is best-supported by the page text. Give priority to explicit cues like 'Title:', 'Talk title:', 'Topic:', and text near 'Abstract:'/'Summary:'. " | |
| # "If a header includes a series label plus a person name/affiliation (e.g. 'Seminar: Jane Doe (MIT)'), treat that as speaker/affiliation (not title) and keep searching the body for the real title. " | |
| # "e.g. this Robotics Institute Seminar: Mahdi Tavakoli (University of Alberta) is not a title" | |
# "Keep person name and affiliation separate. Each should be put in its own field. "
| # "Never guess; quote evidence from the provided text." | |
| # "Never reply with any text other than the JSON object. If you don't find any events, still reply with a JSON object containing the no_upcoming status. " | |
| # "REMEMBER: START TIME IS ALWAYS REQUIRED FOR EACH EVENT. IF THERE IS NO START TIME, DO NOT INCLUDE THE EVENT. " | |
| # "Respond in JSON with the following schema: " | |
| # "status: ok | no_upcoming | error" | |
| # "events: array of up to 3 objects (required if ok)" | |
| # "events[].title: string" | |
| # "events[].start_time: ISO-8601 string; if only date is known, use YYYY-MM-DD (always required)" | |
| # "events[].event_url: string URL" | |
| # "events[].speaker: string " | |
# "events[].affiliation: string (if present)"
| # "events[].evidence: short snippet from provided text (always required)" | |
| # "error: short error string (required if error)" | |
| # "next_url_to_check: string URL (optional - must be one of LINKS if provided)" | |
| # ) | |
# Active system prompt: forces a strict JSON-only reply matching LlmHopResult.
# The schema example at the bottom must stay in sync with the LlmEvent /
# LlmHopResult pydantic models below (note: "speaker" is shown as "" here,
# i.e. the model may omit or blank it).
SYSTEM_PROMPT = """
You are a JSON extraction engine. You do NOT write code.
CRITICAL OUTPUT CONSTRAINTS (HARD):
- Your entire reply MUST be valid JSON (RFC 8259).
- Reply with exactly ONE JSON object.
- The first non-whitespace character MUST be "{" and the last MUST be "}".
- Use double quotes for all JSON strings. Never use single quotes.
- Do NOT include markdown fences (```), explanations, pseudocode, or Python.
If you cannot follow these constraints, reply exactly:
{"status":"error","error":"non_json_or_invalid_schema"}
Event extraction rules (HARD):
- Return up to 3 upcoming events after "now", sorted by start_time ascending.
- Every event MUST include start_time and it MUST be ISO-8601 (e.g. "2026-01-15" or "2026-01-15T14:00+00:00").
- Do NOT use month names or informal dates like "Jan 15, 2026".
- If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event.
- If you find zero events with a date/time, return {"status":"no_upcoming"}.
Title rule (HARD):
- title MUST be copied verbatim from PAGE_TEXT (no paraphrasing).
- title MUST come from the same local event block as the date/time:
- it must appear within 300 characters of the date/time text you used for start_time.
- Do NOT use site/series/page headings or navigation as title.
Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text.
- If you cannot find a specific talk/topic title near the date/time, DO NOT include the event.
If PREVIOUS_EVENTS is provided:
- Use PREVIOUS_EVENTS as a strict copy source.
- If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS.
- Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided.
- If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them.
Final self-check (HARD, perform before replying):
- Your reply must be valid JSON only.
- For each event: verify start_time exists and is a non-empty string.
- If an event has missing/empty start_time, REMOVE that event.
- If no events remain, output {"status":"no_upcoming"}.
Schema:
{ "status": "ok"|"no_upcoming"|"error",
"events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "attending_count": 0}],
"error": "...",
"next_url_to_check": "..." }
"""
class OrgSource(BaseModel):
    """One organization/seminar-series source page to scan for events."""

    id: str  # stable identifier -- presumably copied into EventResult.org_id by the caller
    name: str  # human-readable org name; sent to the LLM as "org" in llm_extract
    url: HttpUrl  # entry-point page for the schedule
    tags: list[str] = Field(default_factory=list)  # free-form labels; not read in this chunk
class LlmEvent(BaseModel):
    """One extracted talk/event, as emitted by the LLM per SYSTEM_PROMPT's schema."""

    title: str  # talk title, copied verbatim from the page text
    start_time: str  # ISO-8601 date or datetime; the prompt makes this mandatory
    event_url: str | None = None
    # SYSTEM_PROMPT only asks for a speaker "when present" (its schema example
    # shows "speaker": ""), so a reply that omits the key must not blow up
    # validation of the whole hop -- default to an empty string rather than
    # requiring the field.
    speaker: str = ""
    affiliation: str | None = None
    attending_count: int = 0
class LlmHopResult(BaseModel):
    """Parsed LLM reply for a single page ("hop"); mirrors SYSTEM_PROMPT's schema."""

    status: Status
    events: list[LlmEvent] = Field(default_factory=list)  # populated when status == "ok"
    error: str | None = None  # short message; the prompt sets it when status == "error"
    next_url_to_check: str | None = None  # best follow-up link when the schedule is elsewhere
class EventResult(BaseModel):
    """Final per-organization outcome, aggregated by a caller outside this chunk."""

    org_id: str
    org_name: str
    source_url: str  # presumably the original entry URL, not the last page visited -- confirm with caller
    status: Status
    events: list[LlmEvent] = Field(default_factory=list)
    checked_at: str  # timestamp of the scan; format set by the producer -- TODO confirm ISO-8601
    hops: int = 0  # number of pages followed for this org
    visited_urls: list[str] = Field(default_factory=list)
    error: str | None = None
@dataclass
class LlmConfig:
    """LLM connection settings consumed by llm_extract().

    The original bare class held only annotations: without a decorator they
    create no attributes, so ``config.model`` raised AttributeError at the
    ``completion`` call site. ``dataclass`` is already imported at the top of
    the file (and previously unused) -- applying it turns the annotations
    into real constructor fields.
    """

    model: str  # litellm model identifier (required)
    api_key: str | None = None  # forwarded to completion() only when truthy
    api_base: str | None = None  # custom endpoint; forwarded only when truthy
def parse_dt_utc(value: str) -> datetime:
    """Parse an ISO-8601 string and normalise it to an aware UTC datetime.

    Naive inputs are assumed to already be in UTC. The stdlib parser handles
    the strict ISO-8601 forms the prompt mandates; dateutil's ``isoparse`` is
    kept only as a fallback for lenient variants the stdlib rejects (e.g. a
    trailing 'Z' on Python < 3.11), so the hot path no longer requires the
    third-party import.

    Raises ValueError (from either parser) when *value* is not ISO-8601.
    """
    try:
        dt = datetime.fromisoformat(value)
    except ValueError:
        # dateutil accepts a superset of fromisoformat's grammar.
        from dateutil import parser as dtparser

        dt = dtparser.isoparse(value)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)  # bare timestamps are treated as UTC
    return dt.astimezone(timezone.utc)
def text_and_links(page_html: str, *, base_url: str, limit: int = 40) -> tuple[str, list[str]]:
    """Reduce *page_html* to plain text plus a capped list of same-host links.

    Returns (text, links): *text* is the visible page text (scripts/styles
    removed, blank lines dropped, capped at 24000 chars); *links* holds at
    most *limit* absolute http(s) URLs whose host matches *base_url*.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html.unescape(page_html), "html.parser")
    # Strip non-visible content before extracting text.
    for junk in soup(["script", "style", "noscript"]):
        junk.decompose()

    stripped = (raw.strip() for raw in soup.get_text("\n").splitlines())
    text = "\n".join(line for line in stripped if line)[:24000]

    host = urlparse(base_url).netloc.lower()
    found: list[str] = []
    for anchor in soup.find_all("a", href=True):
        absolute = urljoin(base_url, str(anchor["href"]).strip())
        parsed = urlparse(absolute)
        # Keep only same-host http(s) links; skip mailto:, fragments on other
        # hosts, etc.
        if parsed.scheme not in {"http", "https"} or parsed.netloc.lower() != host:
            continue
        found.append(absolute)
        if len(found) >= limit:
            break
    return text, found
def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult:
    """Run one LLM extraction hop over a fetched page.

    Sends SYSTEM_PROMPT plus the page text, same-host links, and any
    previously-extracted events to the model, then parses the (JSON-only)
    reply into an LlmHopResult.

    Raises ValueError when the reply is not valid JSON or does not match the
    LlmHopResult schema.
    """
    from litellm import completion  # type: ignore

    page_text, links = text_and_links(page_html, base_url=url)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events})},
        {"role": "user", "content": "LINKS:\n" + "\n".join(links)},
        {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"},
        {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). "
            "Must start with '{' and end with '}'. Use double quotes only. "
            "start_time MUST be ISO-8601 only (e.g. 2026-01-15 or 2026-01-15T14:00+00:00), never 'Jan 15, 2026'. "
            "Before returning, delete any event missing/empty start_time. "
            "Title must be copied verbatim from PAGE_TEXT near the date/time. "
            "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. "
            "If none remain, return {\"status\":\"no_upcoming\"}."},
    ]
    timeout = float(os.environ.get("LLM_TIMEOUT_SECS", "60"))
    kwargs: dict[str, object] = {"model": config.model, "temperature": 0, "timeout": timeout}
    if config.api_key:
        kwargs["api_key"] = config.api_key
    if config.api_base:
        kwargs["api_base"] = config.api_base
    raw = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"]

    # Slice out the outermost {...}. This alone discards any markdown fences
    # around the object; the old ``replace('`', '').replace('json', '')``
    # pre-pass was removed because it also deleted the substring "json" (and
    # every backtick) INSIDE titles, URLs and evidence, corrupting payloads.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"LLM did not return valid JSON:\n{raw}")
    try:
        payload = json.loads(raw[start : end + 1])
    except json.JSONDecodeError as exc:
        raise ValueError(f"LLM did not return valid JSON:\n{raw}") from exc
    try:
        # Validate exactly once and return the parsed model (the original
        # validated twice, doing the work redundantly).
        return LlmHopResult.model_validate(payload)
    except Exception as exc:
        raise ValueError(f"LLM returned JSON that does not match schema: {exc}\n{payload}") from exc