File size: 10,229 Bytes
2b8ca65
 
 
68fc3c9
2b8ca65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68fc3c9
 
 
2b8ca65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68fc3c9
2b8ca65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68fc3c9
2b8ca65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68fc3c9
2b8ca65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68fc3c9
2b8ca65
 
 
 
 
 
68fc3c9
 
2b8ca65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
from __future__ import annotations

import json
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Literal

import html
from urllib.parse import urljoin, urlparse

from pydantic import BaseModel, Field, HttpUrl

# Allowed values of the ``status`` field on LlmHopResult and EventResult.
Status = Literal["ok", "no_upcoming", "error"]

# Desktop-Chrome-style User-Agent string. It is not referenced by the code
# visible in this module section; presumably it is sent by the page-fetching
# code elsewhere -- verify against callers.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120 Safari/537.36"
)

# SYSTEM_PROMPT = (
#             "You are given the page text content and a list of links from the page. "
#             "Find up to the NEXT 3 upcoming talks/events AFTER now, sorted by time ascending. "
#             "If the page lists only a date (no time), output start_time as an ISO-8601 DATE like '2026-01-15'. "
#             "If events are listed as lines with a date (e.g. 'Jan 15, 2026 – ...'), treat each such line as an event. "
#             "If there is no dedicated per-event URL, set event_url to the Source URL. "
#             "If the schedule is not on this page, set next_url_to_check to the single best link to follow (one of LINKS). "
#             "Do NOT invent placeholder titles like 'TBA' unless the page text explicitly contains 'TBA'. "
#             "IMPORTANT: Keep fields clean and separated: "
#             "- events[].title MUST be the talk/event title ONLY (no speaker name, no affiliation, no date/time). "
#             "- Put the speaker/person name in events[].speaker when present in the text (e.g. 'Speaker: …', 'by …', 'Presenter: …'). "
#             "- If the speaker affiliation/institution is present (e.g. 'UC Davis', 'MIT', 'Google DeepMind'), put it in events[].affiliation (do not mix it into title). "
#             "- If a line contains both speaker and title (e.g. 'Jane Doe — Learning Robots' or 'Learning Robots — Jane Doe'), split them correctly. "
#             "Choose the talk title that is best-supported by the page text. Give priority to explicit cues like 'Title:', 'Talk title:', 'Topic:', and text near 'Abstract:'/'Summary:'. "
#             "If a header includes a series label plus a person name/affiliation (e.g. 'Seminar: Jane Doe (MIT)'), treat that as speaker/affiliation (not title) and keep searching the body for the real title. "
#             "e.g. this Robotics Institute Seminar: Mahdi Tavakoli (University of Alberta) is not a title"
#             "Keep person name and affiliation separate. Each should be put in its own field. "
#             "Never guess; quote evidence from the provided text."
#             "Never reply with any text other than the JSON object. If you don't find any events, still reply with a JSON object containing the no_upcoming status. "
#             "REMEMBER: START TIME IS ALWAYS REQUIRED FOR EACH EVENT. IF THERE IS NO START TIME, DO NOT INCLUDE THE EVENT. "

#             "Respond in JSON with the following schema: "

#             "status: ok | no_upcoming | error"
#             "events: array of up to 3 objects (required if ok)"
#             "events[].title: string"
#             "events[].start_time: ISO-8601 string; if only date is known, use YYYY-MM-DD (always required)" 
#             "events[].event_url: string URL"
#             "events[].speaker: string "
#             "events[].affiliation: string (if present)"
#             "events[].evidence: short snippet from provided text (always required)" 
#             "error: short error string (required if error)"
#             "next_url_to_check: string URL (optional - must be one of LINKS if provided)"
# )

# System prompt sent as the first ("system") message of every llm_extract() call.
# This is a runtime string: editing any character changes what the model is told.
# The "Schema" section at the end lists the same keys as the LlmHopResult and
# LlmEvent models, and PAGE_TEXT / PREVIOUS_EVENTS refer to the payloads that
# llm_extract() sends in its user messages.
SYSTEM_PROMPT = """
You are a JSON extraction engine. You do NOT write code.

CRITICAL OUTPUT CONSTRAINTS (HARD):
- Your entire reply MUST be valid JSON (RFC 8259).
- Reply with exactly ONE JSON object.
- The first non-whitespace character MUST be "{" and the last MUST be "}".
- Use double quotes for all JSON strings. Never use single quotes.
- Do NOT include markdown fences (```), explanations, pseudocode, or Python.

If you cannot follow these constraints, reply exactly:
{"status":"error","error":"non_json_or_invalid_schema"}

Event extraction rules (HARD):
- Return up to 3 upcoming events after "now", sorted by start_time ascending.
- Every event MUST include start_time and it MUST be ISO-8601 (e.g. "2026-01-15" or "2026-01-15T14:00+00:00").
- Do NOT use month names or informal dates like "Jan 15, 2026".
- If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event.
- If you find zero events with a date/time, return {"status":"no_upcoming"}.

Title rule (HARD):
- title MUST be copied verbatim from PAGE_TEXT (no paraphrasing).
- title MUST come from the same local event block as the date/time:
  - it must appear within 300 characters of the date/time text you used for start_time.
- Do NOT use site/series/page headings or navigation as title.
  Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text.
- If you cannot find a specific talk/topic title near the date/time, DO NOT include the event.

If PREVIOUS_EVENTS is provided:
- Use PREVIOUS_EVENTS as a strict copy source.
- If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS.
- Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided.
- If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them.

Final self-check (HARD, perform before replying):
- Your reply must be valid JSON only.
- For each event: verify start_time exists and is a non-empty string.
- If an event has missing/empty start_time, REMOVE that event.
- If no events remain, output {"status":"no_upcoming"}.

Schema:
{ "status": "ok"|"no_upcoming"|"error",
    "events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "attending_count": 0}],
  "error": "...",
  "next_url_to_check": "..." }

"""


class OrgSource(BaseModel):
    """An organization whose web page is scanned for upcoming events."""

    # Identifier of the organization (EventResult carries a matching ``org_id``
    # field; presumably copied from here -- verify against the caller).
    id: str
    # Human-readable name; llm_extract() passes it to the model as "org".
    name: str
    # Page URL for the organization, validated by pydantic as an HTTP(S) URL.
    url: HttpUrl
    # Free-form labels; defaults to a fresh empty list per instance.
    tags: list[str] = Field(default_factory=list)


class LlmEvent(BaseModel):
    """One event object as emitted by the LLM.

    The field names match the ``events`` entries of the schema shown at the
    end of SYSTEM_PROMPT. Only ``title`` and ``start_time`` are required,
    which are the two fields the prompt marks as hard requirements.
    """

    # Talk/event title; the prompt asks for it to be copied verbatim from the page.
    title: str
    # Kept as a string: the prompt asks for ISO-8601, either a date or a datetime.
    start_time: str
    event_url: str | None = None
    # Optional. The prompt never makes the speaker mandatory and its schema
    # example uses "", so a reply that omits the key must not fail validation
    # of the whole result.
    speaker: str = ""
    affiliation: str | None = None
    attending_count: int = 0


class LlmHopResult(BaseModel):
    """Validated form of a single LLM reply; this is what llm_extract() returns.

    The fields correspond to the top-level keys of the schema in SYSTEM_PROMPT.
    """

    # One of "ok", "no_upcoming" or "error" (see ``Status``).
    status: Status
    # Extracted events; empty list when the reply has no "events" key.
    events: list[LlmEvent] = Field(default_factory=list)
    # Error text supplied by the model, if any.
    error: str | None = None
    # Link the model suggests checking next, if any. It is not validated as a
    # URL here (plain ``str``).
    next_url_to_check: str | None = None


class EventResult(BaseModel):
    """Outcome of checking one organization for upcoming events.

    NOTE(review): nothing in the visible part of this module constructs this
    model; the field notes below are inferred from names and types and should
    be confirmed against the code that builds it.
    """

    # Presumably OrgSource.id / OrgSource.name of the organization checked.
    org_id: str
    org_name: str
    # Presumably the URL the check started from, stored as a plain string.
    source_url: str
    # One of "ok", "no_upcoming" or "error" (see ``Status``).
    status: Status
    # Events in the same shape that the LLM returns them.
    events: list[LlmEvent] = Field(default_factory=list)
    # Timestamp of the check as a string; format not established here.
    checked_at: str
    # Presumably how many follow-up pages were visited and which URLs -- TODO confirm.
    hops: int = 0
    visited_urls: list[str] = Field(default_factory=list)
    # Error description, if any.
    error: str | None = None


@dataclass
class LlmConfig:
    """Connection settings that llm_extract() forwards to litellm's ``completion``."""

    # Model identifier, always passed as ``model``.
    model: str
    # Passed as ``api_key`` / ``api_base`` only when set to a truthy value;
    # otherwise the keyword is left out of the call entirely.
    api_key: str | None = None
    api_base: str | None = None

def parse_dt_utc(value: str) -> datetime:
    """Parse an ISO-8601 date or datetime string into an aware UTC datetime.

    A value without timezone information is interpreted as already being in
    UTC; a value with an offset is converted to UTC.
    """
    from dateutil.parser import isoparse

    parsed = isoparse(value)
    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    return aware.astimezone(timezone.utc)


def text_and_links(
    page_html: str,
    *,
    base_url: str,
    limit: int = 40,
    max_text_chars: int = 24000,
) -> tuple[str, list[str]]:
    """Extract the visible text and the same-host links of an HTML page.

    Args:
        page_html: Raw HTML markup of the page.
        base_url: URL the page came from. Relative hrefs are resolved against
            it, and only links whose host equals its host are kept.
        limit: Maximum number of links to return; ``limit <= 0`` returns none.
        max_text_chars: Maximum number of characters of text to return.

    Returns:
        ``(text, links)``. ``text`` is the page text with ``<script>``,
        ``<style>`` and ``<noscript>`` removed, one stripped non-empty line per
        row. ``links`` are absolute http(s) URLs in document order; duplicates
        are not removed.
    """
    from bs4 import BeautifulSoup

    # NOTE(review): entities are unescaped *before* parsing, so escaped markup
    # such as "&lt;b&gt;" is parsed as a real tag. Kept as-is; confirm intended.
    soup = BeautifulSoup(html.unescape(page_html), "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    stripped_lines = (ln.strip() for ln in soup.get_text("\n").splitlines())
    text = "\n".join(ln for ln in stripped_lines if ln)[: max(max_text_chars, 0)]

    base_host = urlparse(base_url).netloc.lower()
    links: list[str] = []
    if limit <= 0:
        return text, links

    for anchor in soup.find_all("a", href=True):
        try:
            resolved = urljoin(base_url, str(anchor["href"]).strip())
            parts = urlparse(resolved)
        except ValueError:
            # urllib raises ValueError on malformed hrefs (e.g. "http://[bad");
            # one broken link must not abort extraction for the whole page.
            continue
        if parts.scheme in {"http", "https"} and parts.netloc.lower() == base_host:
            links.append(resolved)
            if len(links) >= limit:
                break

    return text, links


def _json_object_from_reply(raw: object) -> dict:
    """Return the JSON object embedded in an LLM reply, or raise ValueError."""
    if not isinstance(raw, str):
        # litellm can hand back ``None`` content (e.g. an empty completion).
        raise ValueError(f"LLM did not return valid JSON:\n{raw}")

    # Keep only the outermost {...} span. This already drops markdown fences
    # and a leading "json" tag without touching the payload itself; stripping
    # every backtick or every "json" substring would corrupt values such as
    # URLs ending in ".json".
    start = raw.find("{")
    end = raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"LLM did not return valid JSON:\n{raw}")

    try:
        return json.loads(raw[start : end + 1])
    except json.JSONDecodeError as exc:
        raise ValueError(f"LLM did not return valid JSON:\n{raw}") from exc


def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult:
    """Ask the LLM to extract upcoming events from one page.

    Args:
        config: Model name and optional API key/base forwarded to litellm.
        org: Organization being checked; only ``org.name`` is sent to the model.
        url: URL of the page. Sent as ``source_url`` and used as the base for
            resolving the page's links.
        page_html: Raw HTML of the page.
        now_iso: Current time as a string, sent to the model as ``now``.
        previous_events: Event dicts sent to the model as ``PREVIOUS_EVENTS``.

    Returns:
        The model's reply validated as an ``LlmHopResult``.

    Raises:
        ValueError: If the reply contains no parseable JSON object, or the
            JSON does not match the ``LlmHopResult`` schema.

    The request timeout is read from the ``LLM_TIMEOUT_SECS`` environment
    variable (seconds, default 60).
    """
    from litellm import completion  # type: ignore
    from pydantic import ValidationError

    page_text, links = text_and_links(page_html, base_url=url)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events}),
        },
        {"role": "user", "content": "LINKS:\n" + "\n".join(links)},
        {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"},
        # The key constraints are repeated as the last message of the request.
        {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). "
                        "Must start with '{' and end with '}'. Use double quotes only. "
                        "start_time MUST be ISO-8601 only (e.g. 2026-01-15 or 2026-01-15T14:00+00:00), never 'Jan 15, 2026'. "
                        "Before returning, delete any event missing/empty start_time. "
                        "Title must be copied verbatim from PAGE_TEXT near the date/time. "
                        "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. "
                        "If none remain, return {\"status\":\"no_upcoming\"}."},
    ]

    timeout = float(os.environ.get("LLM_TIMEOUT_SECS", "60"))
    kwargs: dict[str, object] = {"model": config.model, "temperature": 0, "timeout": timeout}
    if config.api_key:
        kwargs["api_key"] = config.api_key
    if config.api_base:
        kwargs["api_base"] = config.api_base

    reply = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"]
    payload = _json_object_from_reply(reply)

    try:
        # Validate once and return that same instance.
        return LlmHopResult.model_validate(payload)
    except ValidationError as exc:
        raise ValueError(f"LLM returned JSON that does not match schema: {exc}\n{payload}") from exc