ar0s commited on
Commit
2b8ca65
·
1 Parent(s): 588344b

first commit

Browse files
Files changed (14) hide show
  1. .gitignore +7 -0
  2. ARCHITECTURE.md +264 -0
  3. README.md +54 -6
  4. app.py +153 -0
  5. requirements.txt +13 -0
  6. sources.json +61 -0
  7. src/__init__.py +0 -0
  8. src/fetcher.py +115 -0
  9. src/interested.py +67 -0
  10. src/models.py +245 -0
  11. src/org_colors.py +26 -0
  12. src/ui_log.py +44 -0
  13. tests/__init__.py +0 -0
  14. tests/test_golden.py +46 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .data/
5
+ /data/
6
+ *.log
7
+ .env
ARCHITECTURE.md ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture / Code Flow (with ASCII maps)
2
+
3
+ This repo is intentionally small:
4
+
5
+ - `app.py` = UI + HTTP API (Gradio mounted into FastAPI)
6
+ - `src/fetcher.py` = crawling + LLM extraction + validation + caching
7
+ - `sources.json` = list of org sources to crawl
8
+ - `.data/events.json` (or `/data/events.json`) = cache / golden output format
9
+
10
+ ---
11
+
12
+ ## 1) High-level module map
13
+
14
+ ```
15
+ +--------------------+ imports / calls +---------------------+
16
+ | app.py | ---------------------------> | src/fetcher.py |
17
+ | | | |
18
+ | SeminarsWebApp | | SeminarFetcher |
19
+ | - Gradio UI | | + refresh_all* |
20
+ | - /refresh API | | + models + cache |
21
+ +--------------------+ +---------------------+
22
+ |
23
+ | reads
24
+ v
25
+ +--------------------+
26
+ | sources.json |
27
+ | list[OrgSource] |
28
+ +--------------------+
29
+
30
+ Cache file (written/read by fetcher):
31
+
32
+ +--------------------+
33
+ | .data/events.json | (or /data/events.json)
34
+ | meta + results |
35
+ +--------------------+
36
+ ```
37
+
38
+ ---
39
+
40
+ ## 2) Runtime flow (what happens when you open the UI)
41
+
42
+ ### 2.1 Gradio initial load
43
+
44
+ ```
45
+ Browser
46
+ |
47
+ v
48
+ Gradio page load
49
+ |
50
+ v
51
+ app.py: demo.load(load_initial)
52
+ |
53
+ v
54
+ SeminarsWebApp._stream_refresh(force=False) [generator]
55
+ |
56
+ v
57
+ src/fetcher.py: refresh_all_stream(... force=False)
58
+ |
59
+ +--> if cache usable -> yield logs + cached results -> done
60
+ |
61
+ \--> else -> crawl + LLM -> write cache -> done
62
+ ```
63
+
64
+ ### 2.2 Manual refresh button
65
+
66
+ ```
67
+ User clicks "Refresh now"
68
+ |
69
+ v
70
+ app.py: refresh_btn.click(refresh_click)
71
+ |
72
+ v
73
+ SeminarsWebApp._stream_refresh(force=True)
74
+ |
75
+ v
76
+ src/fetcher.py: refresh_all_stream(... force=True)
77
+ |
78
+ v
79
+ Always crawls + LLM, then writes cache
80
+ ```
81
+
82
+ ---
83
+
84
+ ## 3) Cache flow (explicit in the app)
85
+
86
+ The app explicitly checks cache before crawling.
87
+
88
+ ```
89
+ SeminarsWebApp.stream_refresh(force=False)
90
+ |
91
+ v
92
+ cache = CacheStore(config=..., ttl_hours=...)
93
+ |
94
+ +--> if cache.is_usable(): cache.load() -> UI updates
95
+ |
96
+ \--> else: crawl + LLM -> cache.write(results)
97
+ ```
98
+
99
+ ---
100
+
101
+ ## 4) Detailed fetcher flow (one org)
102
+
103
+ The fetcher is designed around a *stream* of events:
104
+
105
+ - log event: `("log", level, message)`
106
+ - result event: `("result", EventResult)`
107
+
108
+ ### 4.1 One-org pipeline
109
+
110
+ ```
111
+ SeminarFetcher.fetch_next_event_for_org_stream(org)
112
+ |
113
+ v
114
+ for hop in 1..max_hops:
115
+ |
116
+ +--> fetch_html(url)
117
+ | - httpx GET
118
+ | - if 403: optional curl fallback
119
+ |
120
+ +--> llm_extract(...)
121
+ | - text_and_links(html)
122
+ | - LiteLLM completion(...)
123
+ | - safe_json() + normalize_llm_payload()
124
+ | - Pydantic validation -> LlmHopResult
125
+ |
126
+ +--> validate_events(hop, now)
127
+ | - parse_dt(start_time)
128
+ | - filter to future events only
129
+ | - ensure evidence + http(s) URL
130
+ |
131
+ +--> yield ("result", EventResult)
132
+ |
133
+ \--> (optional) follow hop.next_url_to_check if provided
134
+ ```
135
+
136
+ ### 4.2 Key idea: strictness retry
137
+
138
+ If the LLM returns something that is not valid JSON or doesn’t validate, the code retries once in “strict” mode.
139
+
140
+ ```
141
+ llm_extract(strict=False)
142
+ |
143
+ +--> (fails JSON / schema) => retry
144
+ v
145
+ llm_extract(strict=True)
146
+ ```
147
+
148
+ ---
149
+
150
+ ## 5) Data model / JSON shapes
151
+
152
+ ### 5.1 Source input (`sources.json`)
153
+
154
+ ```
155
+ [
156
+ {
157
+ "id": "utoronto",
158
+ "name": "UofT Robotics",
159
+ "url": "https://...",
160
+ "tags": ["canada", "university"]
161
+ },
162
+ ...
163
+ ]
164
+ ```
165
+
166
+ Validated into `OrgSource`.
167
+
168
+ ### 5.2 Per-org output (`EventResult`)
169
+
170
+ ```
171
+ EventResult:
172
+ org_id, org_name, source_url
173
+ status: "ok" | "no_upcoming" | "error"
174
+ events: [LlmEvent, ...]
175
+ checked_at
176
+ hops
177
+ visited_urls
178
+ error (optional)
179
+ ```
180
+
181
+ ### 5.3 Cache file (`.data/events.json`)
182
+
183
+ ```
184
+ {
185
+ "meta": {
186
+ "model": "...",
187
+ "schema_version": 3,
188
+ "cached_at": "...",
189
+ "ttl_hours": 12
190
+ },
191
+ "results": [ EventResult, ... ]
192
+ }
193
+ ```
194
+
195
+ Cache is considered usable when:
196
+ - file exists
197
+ - file age < ttl
198
+ - `meta.schema_version == 3`
199
+ - `meta.model == current LLM model`
200
+
201
+ ---
202
+
203
+ ## 6) Where logs come from
204
+
205
+ Logs are generated in two layers:
206
+
207
+ 1) Fetcher (per hop / per org)
208
+
209
+ ```
210
+ "{org}: hop i/j — HTTP GET start: ..."
211
+ "{org}: hop i/j — HTTP GET done (...)"
212
+ "{org}: hop i/j — LLM call start (model=...)"
213
+ "{org}: hop i/j — LLM call done (...)"
214
+ "{org}: hop i/j — validating extracted event(s)"
215
+ "{org}: success (...)" OR "no upcoming events" OR "error (...)"
216
+ ```
217
+
218
+ 2) App wrapper (per org result summary)
219
+
220
+ ```
221
+ "{org}: ok (k event(s))"
222
+ "{org}: no upcoming events found"
223
+ "{org}: <error message>"
224
+ ```
225
+
226
+ ---
227
+
228
+ ## 7) Environment variables (practical cheat-sheet)
229
+
230
+ ### App / paths
231
+
232
+ - `SOURCES_PATH` (default `sources.json`)
233
+ - `DATA_DIR` (default `.data`, or `/data` if that directory exists)
234
+ - `CACHE_TTL_HOURS` (default `12`)
235
+ - `PORT` (default `7860`)
236
+
237
+ ### `/refresh` auth
238
+
239
+ - `REFRESH_TOKEN` (required to use `/refresh`)
240
+
241
+ ### LLM (LiteLLM)
242
+
243
+ - `LITELLM_MODEL` (or `GEMINI_MODEL` fallback)
244
+ - `LITELLM_API_KEY` (or `GEMINI_API_KEY` fallback)
245
+ - `LITELLM_API_BASE` (optional)
246
+
247
+ Optional knobs:
248
+ - `LLM_TEMPERATURE` (default `0`)
249
+ - `LLM_SEED` (optional)
250
+ - `LLM_MIN_INTERVAL_SECONDS` (optional throttling)
251
+ - `NOW_ISO` (optional override of “current time” for deterministic runs)
252
+
253
+ ---
254
+
255
+ ## 8) Quick “read order” (if you’re new)
256
+
257
+ 1) `app.py`:
258
+ - `SeminarsWebApp._stream_refresh()` to see end-to-end UI flow
259
+ - `build_fastapi()` for `/refresh`
260
+
261
+ 2) `src/fetcher.py`:
262
+ - `refresh_all_stream()` to see caching vs crawling
263
+ - `SeminarFetcher.fetch_next_event_for_org_stream()` for the main pipeline
264
+ - `llm_extract()` + `validate_events()` for correctness guarantees
README.md CHANGED
@@ -1,12 +1,60 @@
1
  ---
2
- title: Robotic Seminars
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Robotic seminars
3
+ emoji: "🤖"
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "5.12.0"
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Robotic seminars (HF Space)
13
+
14
+ This Hugging Face Space aggregates the **next upcoming event** per organization using an LLM via **LiteLLM**. It fetches each page as HTML, extracts a compact text context + candidate links, and asks the model to return structured JSON (up to 3 hops).
15
+
16
+ ## Files
17
+
18
+ - [sources.json](sources.json): list of orgs + starting URLs
19
+ - [app.py](app.py): Gradio UI
20
+ - [src/fetcher.py](src/fetcher.py): LiteLLM hop loop, validation, caching
21
+
22
+ ## Environment variables (HF Space “Secrets”)
23
+
24
+ - `LITELLM_MODEL` (recommended): LiteLLM model string, e.g. `gemini/gemini-2.0-flash`, `openai/gpt-4o-mini`, `anthropic/claude-3-5-sonnet-20241022`
25
+ - `LITELLM_FALLBACK_MODELS` (optional): comma-separated fallback models to try if the primary hits a rate limit
26
+ - `LLM_MIN_INTERVAL_SECONDS` (optional): minimum delay between LLM calls (useful for very low RPM limits)
27
+ - `LITELLM_API_KEY` (optional): explicit API key to pass to LiteLLM
28
+ - `LITELLM_API_BASE` (optional): custom base URL (useful for proxies/self-hosted endpoints)
29
+
30
+ Backwards-compatible (still accepted):
31
+ - `GEMINI_API_KEY` (optional): used as a fallback for `LITELLM_API_KEY`
32
+ - `GEMINI_MODEL` (optional): used as a fallback for `LITELLM_MODEL`
33
+
34
+ Provider-specific env vars also work (recommended):
35
+ - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, etc.
36
+
37
+ - `CACHE_TTL_HOURS` (optional): defaults to `12`
38
+
39
+ ## Cache
40
+
41
+ The app writes its cache to `/data/robotic_seminars/events.json`.
42
+
43
+ - On Hugging Face Spaces: enable **Persistent Storage** so `/data` exists and is writable.
44
+ - Locally: create `/data/robotic_seminars` and ensure it’s writable by your user.
45
+
46
+ ## Local run
47
+
48
+ ```bash
49
+ python -m venv .venv
50
+ source .venv/bin/activate
51
+ pip install -r requirements.txt
52
+
53
+ # Create a .env from the template and set your keys/model:
54
+ cp .env.example .env
55
+
56
+ # Or export env vars manually if you prefer.
57
+ python app.py
58
+ ```
59
+
60
+ Open http://localhost:7860
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+
8
+ import gradio as gr
9
+ from dotenv import load_dotenv
10
+ from fastapi import FastAPI
11
+
12
+ from src.fetcher import CacheStore, LlmConfig, OrgSource, SeminarFetcher
13
+ from src.interested import COLS, inc_interested, results_table
14
+ from src.models import parse_dt_utc
15
+ from src.org_colors import org_colors
16
+ from src.ui_log import bind as bind_log, error, info
17
+
18
+ # import debugpy
19
+ # print("Waiting for debugger attach...")
20
+ # debugpy.listen(5678)
21
+ # debugpy.wait_for_client()
22
+ # print("Debugger attached.")
23
+
24
+ load_dotenv(Path(__file__).with_name(".env"))
25
+
26
+
27
def ts() -> str:
    """Current UTC time as an ISO-8601 string, truncated to whole seconds."""
    now = datetime.now(timezone.utc)
    return now.isoformat(timespec="seconds")
29
+
30
+ SOURCES_PATH = os.environ.get("SOURCES_PATH", "sources.json")
31
+ TTL_HOURS = float(os.environ.get("CACHE_TTL_HOURS", "12"))
32
+ LLM = LlmConfig(model=os.environ.get("LITELLM_MODEL", "gemini/gemini-2.0-flash"), api_key=os.environ.get("LITELLM_API_KEY"))
33
+
34
+ SOURCES_RAW = json.loads(Path(SOURCES_PATH).read_text(encoding="utf-8"))
35
+ COLORS = org_colors(SOURCES_RAW)
36
+
37
+ SOURCES_MD = "\n".join(
38
+ f"- [{s['name']}]({s['url']})"
39
+ for s in SOURCES_RAW
40
+ )
41
+
42
+
43
def stream_refresh(force: bool):
    """Generator backing both the initial page load and the "Refresh now" button.

    Yields ``(status_md, dataframe, logs_md, results, row_map)`` tuples so Gradio
    can update the UI incrementally. With ``force=False`` a fresh, schema/model
    matching cache short-circuits the crawl; otherwise every source is crawled
    and the cache is rewritten.
    """
    logs: list[str] = []
    results: list[dict] = []  # EventResult dicts, accumulated as orgs complete
    colors = COLORS

    def emit(status: str):
        # Re-render the table from current results and package one full UI update.
        df, row_map = results_table(results, colors)
        return status, df, "<br>\n".join(logs), results, row_map

    with bind_log(logs):  # route info()/error() output into the logs list
        info(f"refresh(force={force}, ttl_hours={TTL_HOURS})")
        info(f"model={LLM.model}")

        try:
            done = 0
            sources = [OrgSource.model_validate(s) for s in SOURCES_RAW]

            started = ts()
            cache = CacheStore(config=LLM, ttl_hours=TTL_HOURS)
            # Fast path: serve the cached results without any crawling/LLM calls.
            if not force and cache.is_usable():
                results.extend(json.loads(cache.path.read_text(encoding="utf-8"))["results"])
                info(f"used cache: {cache.path}")
                yield emit(f"Refreshed: {ts()} (started {started})")
                return
            info("cache miss, crawling")

            # Even a stale/mismatched cache is still read: its events seed the LLM
            # prompt (prev_by_org) and its interested counts are carried forward.
            old_results = json.loads(cache.path.read_text(encoding="utf-8"))["results"] if cache.path.exists() else []
            prev_by_org = {r["org_id"]: r["events"] for r in old_results}

            # Index the previous run's interested counts so they survive a refresh:
            #   old_by_exact: (org, start minute, url) -> count   (exact match)
            #   old_by_dt:    (org, start minute) -> first count seen for that slot
            #   old_by_dt_n:  (org, start minute) -> events sharing that slot
            # The datetime-only fallback is applied only when the slot was unique.
            old_by_exact: dict[tuple[str, str, str], int] = {}
            old_by_dt: dict[tuple[str, str], int] = {}
            old_by_dt_n: dict[tuple[str, str], int] = {}
            for r in old_results:
                for ev in r["events"]:
                    dt = parse_dt_utc(ev["start_time"]).isoformat(timespec="minutes")
                    url = ev["event_url"] or ""
                    old_by_exact[(r["org_id"], dt, url)] = ev["interested_count"]
                    k2 = (r["org_id"], dt)
                    try:
                        old_by_dt_n[k2] += 1
                    except KeyError:
                        old_by_dt_n[k2] = 1
                    if k2 not in old_by_dt:
                        old_by_dt[k2] = ev["interested_count"]

            fetcher = SeminarFetcher(config=LLM, now=None, max_hops=3, max_events=3, previous_events_by_org=prev_by_org)
            event_results = []
            for org in sources:
                # Progress update before each org so the UI shows the crawl advancing.
                yield emit(f"Refreshing… {done}/{len(sources)} (started {started})")
                done += 1

                for r in fetcher.fetch_next_event_for_org_stream(org):
                    for ev in r.events:
                        dt = parse_dt_utc(ev.start_time).isoformat(timespec="minutes")
                        url = ev.event_url or ""
                        k1 = (r.org_id, dt, url)
                        if k1 in old_by_exact:
                            ev.interested_count = old_by_exact[k1]
                            continue
                        # Fallback: match by datetime alone, but only if unambiguous
                        # (the event URL may have changed between crawls).
                        k2 = (r.org_id, dt)
                        if k2 in old_by_dt and old_by_dt_n[k2] == 1:
                            ev.interested_count = old_by_dt[k2]
                    event_results.append(r)
                    results.append(r.model_dump())

            cache.write(results=event_results)
            info(f"wrote cache: {cache.path}")
            yield emit(f"Refreshed: {ts()} (started {started})")
        except Exception:
            # Top-level boundary: surface the traceback in the Logs accordion
            # instead of killing the Gradio event stream.
            import traceback

            error(f"Unhandled exception:\n{traceback.format_exc()}")
            yield emit(f"Error: {ts()}")
            return
117
+
118
+
119
with gr.Blocks() as demo:
    # Page layout: header, status line, events table, then collapsible sources/logs.
    gr.Markdown("# Robotic seminars\nNext upcoming event per org.")
    status = gr.Markdown("")
    table = gr.Dataframe(
        headers=COLS,
        # Title and Organization columns carry markdown (links / colored tags).
        datatype=["str", "markdown", "str", "markdown", "str"],
        interactive=True,
        wrap=True,
    )
    # Server-side state carried between events: raw result dicts plus the
    # table-row -> (result index, event index) mapping produced by results_table.
    results_state = gr.State([])
    row_map_state = gr.State([])
    refresh_btn = gr.Button("Refresh now")
    with gr.Accordion("Sources", open=False):
        gr.Markdown(SOURCES_MD)
    with gr.Accordion("Logs", open=False):
        logs_box = gr.Markdown()

    def on_select(results: list[dict], row_map: list[tuple[int, int]], evt: gr.SelectData):
        # Cell-click handler: bumps the clicked event's "Interested" counter
        # (no-op for clicks outside that column; see src/interested.py).
        return inc_interested(evt, results, row_map, colors=COLORS, llm=LLM, ttl_hours=TTL_HOURS)

    # Initial load may serve from cache (force=False); the button always re-crawls.
    demo.load(stream_refresh, inputs=[gr.State(False)], outputs=[status, table, logs_box, results_state, row_map_state])
    refresh_btn.click(stream_refresh, inputs=[gr.State(True)], outputs=[status, table, logs_box, results_state, row_map_state])
    table.select(
        on_select,
        inputs=[results_state, row_map_state],
        outputs=[table, results_state, row_map_state],
    )
146
+
147
# HF Spaces runs Gradio apps itself; avoid mounting + running Uvicorn here.
# app = gr.mount_gradio_app(FastAPI(), demo, path="/")
if __name__ == "__main__":
    # import uvicorn
    # uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))

    # Local dev entry point: listen on all interfaces, port from PORT (default 7860).
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", "7860")))
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.12.0
2
+ fastapi==0.115.6
3
+ uvicorn==0.34.0
4
+ pydantic==2.10.5
5
+ python-dateutil==2.9.0.post0
6
+ pandas==2.2.3
7
+ httpx==0.28.1
8
+ h2>=4.1.0
9
+ litellm>=1.0.0
10
+ python-dotenv>=1.0.0
11
+ beautifulsoup4>=4.12.2
12
+ pytest>=8.0.0
13
+ curl_cffi>=0.14.0
sources.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "cmu-ri-seminar",
4
+ "name": "Carnegie Mellon University Robotics Institute Seminar Series",
5
+ "url": "https://www.ri.cmu.edu/events/",
6
+ "tags": ["robotics"]
7
+ },
8
+ {
9
+ "id": "rig",
10
+ "name": "Robotics Institute Germany (RIG) Lecture Series",
11
+ "url": "https://robotics-institute-germany.de/rig-lecture-series-weekly-online-lectures-on-robotics/",
12
+ "tags": ["robotics"]
13
+ },
14
+ {
15
+ "id": "stanford-engr319",
16
+ "name": "Stanford Robotics & Autonomous Systems Seminar (ENGR319)",
17
+ "url": "https://stanfordasl.github.io/robotics_seminar/",
18
+ "tags": ["robotics"]
19
+ },
20
+ {
21
+ "id": "utoronto-ri",
22
+ "name": "University of Toronto Robotics Institute Seminar Series",
23
+ "url": "https://robotics.utoronto.ca/seminar-series/",
24
+ "tags": ["robotics"]
25
+ },
26
+ {
27
+ "id": "eth-rvc-talks",
28
+ "name": "ETH Zürich Robotics, Vision, and Controls Talks",
29
+ "url": "https://robotics-talks.com/",
30
+ "tags": ["robotics"]
31
+ },
32
+ {
33
+ "id": "umd-mrc-seminars",
34
+ "name": "Maryland Robotics Center (UMD) Robotics Seminar Series",
35
+ "url": "https://robotics.umd.edu/events/mrc-seminars",
36
+ "tags": ["robotics"]
37
+ },
38
+ {
39
+ "id": "imperial-rl-seminar",
40
+ "name": "Imperial College London Robot Learning Seminar Series",
41
+ "url": "https://www.robot-learning.uk/seminar-series",
42
+ "tags": ["robotics"]
43
+ },
44
+ {
45
+ "id": "gatech-irim-seminar",
46
+ "name": "Georgia Tech IRIM Seminar Series",
47
+ "url": "https://research.gatech.edu/robotics/irim-seminar-series",
48
+ "tags": ["robotics"]
49
+ },
50
+ { "id": "robot-talk",
51
+ "name": "Robot Talk",
52
+ "url": "https://www.robottalk.org/latest-episodes/",
53
+ "tags": ["robotics"]
54
+ },
55
+ {
56
+ "id": "montreal-robotics",
57
+ "name": "Montréal Robotics / Mila Robot Learning Seminar",
58
+ "url": "https://montrealrobotics.ca/robotlearningseries/",
59
+ "tags": ["robotics"]
60
+ }
61
+ ]
src/__init__.py ADDED
File without changes
src/fetcher.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from datetime import datetime, timezone
5
+ from typing import Iterator
6
+
7
+ from urllib.parse import urlparse
8
+ from curl_cffi import requests as r # type: ignore
9
+
10
+ from dateutil import parser as dtparser
11
+
12
+ from .models import (
13
+ CacheStore,
14
+ EventResult,
15
+ LlmConfig,
16
+ LlmEvent,
17
+ OrgSource,
18
+ USER_AGENT,
19
+ llm_extract,
20
+ parse_dt_utc,
21
+ )
22
+ from .ui_log import error, info, warn
23
+
24
+
25
class SeminarFetcher:
    """Crawls each org's events page (following at most ``max_hops`` links) and
    extracts upcoming events via an LLM (``llm_extract``).

    ``previous_events_by_org`` maps org id -> prior events; they are passed to the
    LLM so previously extracted events are copied verbatim (see SYSTEM_PROMPT).
    """

    def __init__(
        self,
        config: LlmConfig,
        now: datetime | None = None,
        max_hops: int = 3,
        max_events: int = 3,
        previous_events_by_org: dict[str, list[dict]] | None = None,
    ):
        self.config = config
        self.max_hops = max_hops
        self.max_events = max_events
        self.previous_events_by_org = previous_events_by_org or {}

        # Resolve "now": explicit argument > NOW_ISO env (deterministic test runs)
        # > wall clock. Naive datetimes are assumed UTC, whatever their source.
        if now is None:
            raw = (os.environ.get("NOW_ISO") or "").strip()
            now = dtparser.isoparse(raw) if raw else datetime.now(timezone.utc)
        if now.tzinfo is None:
            now = now.replace(tzinfo=timezone.utc)
        self.now = now.astimezone(timezone.utc)

    def fetch_html(self, url: str) -> str:
        """GET ``url`` impersonating a Chrome browser and return the HTML body.

        Raises on non-2xx responses (``raise_for_status``).
        """
        p = urlparse(url)
        resp = r.get(
            url,
            timeout=20,
            allow_redirects=True,
            impersonate="chrome120",  # some event sites 403 plain HTTP clients
            headers={"User-Agent": USER_AGENT, "Accept": "text/html", "Referer": f"{p.scheme}://{p.netloc}/"},
        )
        resp.raise_for_status()
        return resp.text

    def fetch_next_event_for_org_stream(self, org: OrgSource) -> Iterator[EventResult]:
        """Yield exactly one ``EventResult`` for ``org``.

        Follows the LLM-suggested next URL for up to ``max_hops`` pages; keeps only
        strictly-future events (earliest first), capped at ``max_events``.
        """
        checked_at = datetime.now(timezone.utc).isoformat()
        now_iso = self.now.isoformat()
        url = str(org.url)
        visited: list[str] = []
        info(f"Inspecting {org.name}…")
        # Pessimistic default; overwritten below unless every hop redirects onward.
        result = EventResult(org_id=org.id, org_name=org.name, source_url=str(org.url), status="no_upcoming",
            events=[], checked_at=checked_at, hops=self.max_hops, visited_urls=visited, error="Max hops reached.",
        )

        for hop_i in range(1, self.max_hops + 1):
            visited.append(url)
            info(f"Inspecting {org.name}: fetching {url}")

            previous_events = self.previous_events_by_org.get(org.id, [])
            hop = llm_extract(
                config=self.config,
                org=org,
                url=url,
                page_html=self.fetch_html(url),
                now_iso=now_iso,
                previous_events=previous_events,
            )
            if hop.status != "ok":
                # Follow the suggested next page, but never revisit a URL;
                # otherwise record the failure and stop.
                next_url = hop.next_url_to_check
                if next_url and next_url not in visited:
                    info(f"Inspecting {org.name}: following {next_url}")
                    url = next_url
                    continue
                (error if hop.status == "error" else warn)(f"Inspecting {org.name}: {hop.status}: {hop.error}")
                result.status = hop.status
                result.hops = hop_i
                result.error = hop.error
                break

            # Keep only strictly-future events, sorted ascending, up to max_events.
            events: list[LlmEvent] = []
            parsed = [(parse_dt_utc(e.start_time), e) for e in hop.events]
            parsed.sort(key=lambda x: x[0])
            for dt, e in parsed:
                if dt <= self.now:
                    continue
                if e.event_url is None:
                    # No per-event page: fall back to the page we extracted from.
                    e = e.model_copy(update={"event_url": url})
                events.append(e)
                if len(events) >= self.max_events:
                    break

            result.status = "ok" if events else "no_upcoming"
            result.events = events
            result.hops = hop_i
            result.error = hop.error
            if events:
                info(f"Found {len(events)} upcoming event(s) for {org.name}")
            else:
                warn(f"No upcoming events found for {org.name}")
            break

        yield result

    def fetch_next_event_for_org(self, org: OrgSource) -> EventResult:
        """Convenience wrapper: return the single result from the streaming variant."""
        return next(self.fetch_next_event_for_org_stream(org))
src/interested.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+
6
+ from .fetcher import CacheStore, EventResult, LlmConfig
7
+ from .org_colors import org_tag
8
+
9
+ COLS = ["Date/Time (UTC)", "Title", "Speaker", "Organization", "Interested"]
10
+
11
+
12
def results_table(results: list[dict], colors: dict[str, str]) -> tuple[pd.DataFrame, list[tuple[int, int]]]:
    """Flatten per-org results into a display DataFrame plus a row-index map.

    Only results with status "ok" contribute rows. The returned row map gives,
    for each displayed row, the (result index, event index) into ``results`` so
    click handlers can locate the underlying event after sorting.
    """
    records: list[dict[str, object]] = []
    for res_idx, res in enumerate(results):
        if res["status"] != "ok":
            continue  # skip no_upcoming / error orgs entirely
        for evt_idx, event in enumerate(res["events"]):
            name = event["speaker"] or ""
            inst = event["affiliation"] or ""
            if name and inst:
                who = f"{name} ({inst})"
            else:
                who = name or inst
            title_md = event["title"]
            if event["event_url"]:
                title_md = f"[{event['title']}]({event['event_url']})"
            records.append(
                {
                    "Date/Time (UTC)": event["start_time"],
                    "Title": title_md,
                    "Speaker": who,
                    "Organization": org_tag(res["org_name"], colors),
                    "Interested": f"{event['interested_count']} (+)",
                    # Hidden helper columns, dropped before display.
                    "_org": res["org_name"],
                    "_r": res_idx,
                    "_e": evt_idx,
                }
            )
    if not records:
        return pd.DataFrame(columns=COLS), []
    frame = pd.DataFrame(records)
    # Sort chronologically (unparseable dates last), tie-break by org name.
    frame["_sort"] = pd.to_datetime(frame["Date/Time (UTC)"], utc=True, errors="coerce")
    frame = frame.sort_values(by=["_sort", "_org"], na_position="last").reset_index(drop=True)
    mapping = list(zip(frame["_r"].astype(int).tolist(), frame["_e"].astype(int).tolist()))
    return frame.drop(columns=["_sort", "_org", "_r", "_e"])[COLS], mapping
40
+
41
+
42
def inc_interested(
    evt: gr.SelectData | None,
    results: list[dict],
    row_map: list[tuple[int, int]],
    *,
    colors: dict[str, str],
    llm: LlmConfig,
    ttl_hours: float,
):
    """Handle a table cell click: bump an event's interested_count when the
    "Interested" cell was hit, then re-render.

    Always returns the (df, results, row_map) triple; clicks with no selection
    data or outside the "Interested" column just re-render without mutating.
    (Previously the render-and-return tail was duplicated in three places.)
    """
    hit = (
        evt is not None
        and evt.index is not None
        and int(evt.index[1]) == COLS.index("Interested")
    )
    if hit:
        # Map the displayed (sorted) row back to the underlying result/event.
        r_i, ev_i = row_map[int(evt.index[0])]
        results[r_i]["events"][ev_i]["interested_count"] += 1
        # Persist immediately so counts survive restarts / cache reloads.
        cache = CacheStore(config=llm, ttl_hours=ttl_hours)
        cache.write(results=[EventResult.model_validate(x) for x in results])

    df, row_map = results_table(results, colors)
    return df, results, row_map
src/models.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ import html
10
+ from urllib.parse import urljoin, urlparse
11
+
12
+ from pydantic import BaseModel, Field, HttpUrl
13
+
14
+ Status = Literal["ok", "no_upcoming", "error"]
15
+
16
+ CACHE_SCHEMA_VERSION = 4
17
+ USER_AGENT = (
18
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
19
+ "(KHTML, like Gecko) Chrome/120 Safari/537.36"
20
+ )
21
+
22
+ # SYSTEM_PROMPT = (
23
+ # "You are given the page text content and a list of links from the page. "
24
+ # "Find up to the NEXT 3 upcoming talks/events AFTER now, sorted by time ascending. "
25
+ # "If the page lists only a date (no time), output start_time as an ISO-8601 DATE like '2026-01-15'. "
26
+ # "If events are listed as lines with a date (e.g. 'Jan 15, 2026 – ...'), treat each such line as an event. "
27
+ # "If there is no dedicated per-event URL, set event_url to the Source URL. "
28
+ # "If the schedule is not on this page, set next_url_to_check to the single best link to follow (one of LINKS). "
29
+ # "Do NOT invent placeholder titles like 'TBA' unless the page text explicitly contains 'TBA'. "
30
+ # "IMPORTANT: Keep fields clean and separated: "
31
+ # "- events[].title MUST be the talk/event title ONLY (no speaker name, no affiliation, no date/time). "
32
+ # "- Put the speaker/person name in events[].speaker when present in the text (e.g. 'Speaker: …', 'by …', 'Presenter: …'). "
33
+ # "- If the speaker affiliation/institution is present (e.g. 'UC Davis', 'MIT', 'Google DeepMind'), put it in events[].affiliation (do not mix it into title). "
34
+ # "- If a line contains both speaker and title (e.g. 'Jane Doe — Learning Robots' or 'Learning Robots — Jane Doe'), split them correctly. "
35
+ # "Choose the talk title that is best-supported by the page text. Give priority to explicit cues like 'Title:', 'Talk title:', 'Topic:', and text near 'Abstract:'/'Summary:'. "
36
+ # "If a header includes a series label plus a person name/affiliation (e.g. 'Seminar: Jane Doe (MIT)'), treat that as speaker/affiliation (not title) and keep searching the body for the real title. "
37
+ # "e.g. this Robotics Institute Seminar: Mahdi Tavakoli (University of Alberta) is not a title"
38
+ # "Keep person name and affiliation seprate. Each should be put in its own field. "
39
+ # "Never guess; quote evidence from the provided text."
40
+ # "Never reply with any text other than the JSON object. If you don't find any events, still reply with a JSON object containing the no_upcoming status. "
41
+ # "REMEMBER: START TIME IS ALWAYS REQUIRED FOR EACH EVENT. IF THERE IS NO START TIME, DO NOT INCLUDE THE EVENT. "
42
+
43
+ # "Respond in JSON with the following schema: "
44
+
45
+ # "status: ok | no_upcoming | error"
46
+ # "events: array of up to 3 objects (required if ok)"
47
+ # "events[].title: string"
48
+ # "events[].start_time: ISO-8601 string; if only date is known, use YYYY-MM-DD (always required)"
49
+ # "events[].event_url: string URL"
50
+ # "events[].speaker: string "
51
+ # "events[].affiliation: string (if prersent)"
52
+ # "events[].evidence: short snippet from provided text (always required)"
53
+ # "error: short error string (required if error)"
54
+ # "next_url_to_check: string URL (optional - must be one of LINKS if provided)"
55
+ # )
56
+
57
+ SYSTEM_PROMPT = """
58
+ You are a JSON extraction engine. You do NOT write code.
59
+
60
+ CRITICAL OUTPUT CONSTRAINTS (HARD):
61
+ - Your entire reply MUST be valid JSON (RFC 8259).
62
+ - Reply with exactly ONE JSON object.
63
+ - The first non-whitespace character MUST be "{" and the last MUST be "}".
64
+ - Use double quotes for all JSON strings. Never use single quotes.
65
+ - Do NOT include markdown fences (```), explanations, pseudocode, or Python.
66
+
67
+ If you cannot follow these constraints, reply exactly:
68
+ {"status":"error","error":"non_json_or_invalid_schema"}
69
+
70
+ Event extraction rules (HARD):
71
+ - Return up to 3 upcoming events after "now", sorted by start_time ascending.
72
+ - Every event MUST include start_time. If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event.
73
+ - If you find zero events with a date/time, return {"status":"no_upcoming"}.
74
+
75
+ Title rule (HARD):
76
+ - title MUST be copied verbatim from PAGE_TEXT (no paraphrasing).
77
+ - title MUST come from the same local event block as the date/time:
78
+ - it must appear within 300 characters of the date/time text you used for start_time.
79
+ - Do NOT use site/series/page headings or navigation as title.
80
+ Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text.
81
+ - If you cannot find a specific talk/topic title near the date/time, DO NOT include the event.
82
+
83
+ If PREVIOUS_EVENTS is provided:
84
+ - Use PREVIOUS_EVENTS as a strict copy source.
85
+ - If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS.
86
+ - Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided.
87
+ - If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them.
88
+
89
+ Final self-check (HARD, perform before replying):
90
+ - Your reply must be valid JSON only.
91
+ - For each event: verify start_time exists and is a non-empty string.
92
+ - If an event has missing/empty start_time, REMOVE that event.
93
+ - If no events remain, output {"status":"no_upcoming"}.
94
+
95
+ Schema:
96
+ { "status": "ok"|"no_upcoming"|"error",
97
+ "events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "interested_count": 0}],
98
+ "error": "...",
99
+ "next_url_to_check": "..." }
100
+
101
+ """
102
+
103
+
104
class OrgSource(BaseModel):
    """One entry of sources.json: an organization whose events page is crawled."""

    id: str  # stable slug; used as the cache / carry-over join key
    name: str
    url: HttpUrl  # starting page for the crawl
    tags: list[str] = Field(default_factory=list)
109
+
110
+
111
class LlmEvent(BaseModel):
    """A single seminar/talk extracted by the LLM (mirrors the schema in SYSTEM_PROMPT)."""

    title: str  # talk title, copied verbatim from the page text per the prompt's title rule
    start_time: str  # date/time string as found on the page; required — events without it are dropped
    event_url: str | None = None  # link to the event's detail page, if any
    speaker: str  # speaker name ("" when unknown, per the prompt schema)
    affiliation: str | None = None  # speaker's institution, if stated on the page
    interested_count: int = 0  # count of "interested" marks (starts at 0)
118
+
119
+
120
class LlmHopResult(BaseModel):
    """Parsed LLM reply for one crawl hop: a status, extracted events, and an optional next URL."""

    status: Status  # "ok" | "no_upcoming" | "error" (see Schema in SYSTEM_PROMPT)
    events: list[LlmEvent] = Field(default_factory=list)  # events found on this page
    error: str | None = None  # error detail when status is "error"
    next_url_to_check: str | None = None  # same-site URL the LLM suggests crawling next
125
+
126
+
127
class EventResult(BaseModel):
    """Final per-organization crawl outcome; serialized into the cache file by CacheStore."""

    org_id: str  # OrgSource.id of the crawled organization
    org_name: str  # OrgSource.name, for display
    source_url: str  # starting URL that was crawled
    status: Status  # overall outcome for this org
    events: list[LlmEvent] = Field(default_factory=list)  # extracted events kept for display
    checked_at: str  # timestamp of the crawl (volatile — stripped by the golden-test normalizer)
    hops: int = 0  # number of crawl hops performed
    visited_urls: list[str] = Field(default_factory=list)  # every URL fetched during the crawl
    error: str | None = None  # failure detail when status is "error"
137
+
138
+
139
@dataclass
class LlmConfig:
    """LLM connection settings; forwarded to litellm's completion() by llm_extract."""

    model: str  # litellm model identifier (also recorded in cache meta for compatibility checks)
    api_key: str | None = None  # optional API key override passed to completion()
    api_base: str | None = None  # optional custom API base URL passed to completion()
144
+
145
+
146
class CacheStore:
    """Disk cache of crawl results with a freshness TTL and schema/model compatibility checks."""

    def __init__(self, config: LlmConfig, ttl_hours: float, path: Path = Path("/data/robotic_seminars/events.json")):
        self.config = config  # model identity participates in cache validity
        self.ttl_hours = ttl_hours  # freshness window in hours
        self.path = path  # cache file location

    def is_fresh(self) -> bool:
        """Return True if the cache file exists and its mtime is within the TTL window."""
        if not self.path.exists():
            return False
        mtime = datetime.fromtimestamp(self.path.stat().st_mtime, tz=timezone.utc)
        return (datetime.now(timezone.utc) - mtime).total_seconds() < self.ttl_hours * 3600

    def is_usable(self) -> bool:
        """Return True if the cache is fresh AND was written with this schema version and model.

        A corrupt/truncated cache file, an unreadable file, or one missing its
        meta keys is treated as unusable rather than raising (previously this
        crashed with JSONDecodeError/KeyError).
        """
        if not self.is_fresh():
            return False
        try:
            meta = json.loads(self.path.read_text(encoding="utf-8")).get("meta") or {}
        except (OSError, json.JSONDecodeError):
            return False
        return meta.get("schema_version") == CACHE_SCHEMA_VERSION and meta.get("model") == self.config.model

    def write(self, *, results: list[EventResult]) -> None:
        """Persist *results* plus cache metadata (model, schema version, timestamp, TTL) as JSON."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "meta": {
                "model": self.config.model,
                "schema_version": CACHE_SCHEMA_VERSION,
                "cached_at": datetime.now(timezone.utc).isoformat(),
                "ttl_hours": self.ttl_hours,
            },
            "results": [r.model_dump() for r in results],
        }
        self.path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
176
+
177
+
178
def parse_dt_utc(value: str) -> datetime:
    """Parse a free-form date/time string into an aware UTC datetime.

    Naive inputs (no timezone in the text) are assumed to already be in UTC.
    """
    from dateutil import parser as dtparser

    parsed = dtparser.parse(value)
    if parsed.tzinfo is not None:
        return parsed.astimezone(timezone.utc)
    return parsed.replace(tzinfo=timezone.utc)
185
+
186
+
187
def text_and_links(page_html: str, *, base_url: str, limit: int = 40) -> tuple[str, list[str]]:
    """Extract readable text and up to *limit* unique same-domain links from an HTML page.

    Returns (text, links): text is the page's visible text (scripts/styles/noscript
    removed, blank lines dropped, truncated to 24k chars); links are absolute
    http(s) URLs on the same host as *base_url*, deduplicated in document order.
    """
    from bs4 import BeautifulSoup

    # NOTE(review): unescaping before parsing can turn escaped markup in text
    # (e.g. "&lt;script&gt;") into real tags; kept for compatibility — confirm intent.
    soup = BeautifulSoup(html.unescape(page_html), "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = "\n".join(ln.strip() for ln in soup.get_text("\n").splitlines() if ln.strip())[:24000]

    base_dom = urlparse(base_url).netloc.lower()
    links: list[str] = []
    seen: set[str] = set()  # dedupe: repeated nav/menu links must not exhaust the limit
    for a in soup.find_all("a", href=True):
        u = urljoin(base_url, str(a["href"]).strip())
        p = urlparse(u)
        if p.scheme in {"http", "https"} and p.netloc.lower() == base_dom and u not in seen:
            seen.add(u)
            links.append(u)
            if len(links) >= limit:
                break

    return text, links
206
+
207
+
208
def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult:
    """Run one LLM extraction hop over *page_html* and return the validated result.

    Builds the prompt (org context, same-site links, page text, PREVIOUS_EVENTS),
    calls litellm, extracts the outermost JSON object from the reply, and
    validates it against LlmHopResult.

    Raises:
        ValueError: if the reply contains no JSON object, invalid JSON, or JSON
            that does not match the LlmHopResult schema.
    """
    from litellm import completion  # type: ignore

    page_text, links = text_and_links(page_html, base_url=url)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events})},
        {"role": "user", "content": "LINKS:\n" + "\n".join(links)},
        {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"},
        {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). "
                                    "Must start with '{' and end with '}'. Use double quotes only. "
                                    "Before returning, delete any event missing/empty start_time. "
                                    "Title must be copied verbatim from PAGE_TEXT near the date/time. "
                                    "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. "
                                    "If none remain, return {\"status\":\"no_upcoming\"}."}
    ]

    kwargs: dict[str, object] = {"model": config.model, "temperature": 0}
    if config.api_key:
        kwargs["api_key"] = config.api_key
    if config.api_base:
        kwargs["api_base"] = config.api_base

    content = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"]

    # Extract the outermost {...} span. This tolerates markdown code fences
    # without mangling the payload (the previous `.replace('json', '')` deleted
    # the substring "json" anywhere in the reply, corrupting titles and URLs).
    start = content.find("{")
    end = content.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"LLM reply contains no JSON object:\n{content}")
    raw = content[start : end + 1]

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"LLM did not return valid JSON:\n{raw}") from e

    # Validate exactly once (was validated twice before) and chain the cause.
    try:
        return LlmHopResult.model_validate(data)
    except Exception as e:
        raise ValueError(f"LLM returned JSON that does not match schema: {e}\n{data}") from e
src/org_colors.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from html import escape
4
+
5
# Fixed palette of visually distinct hex colors, assigned to orgs round-robin.
PALETTE = [
    "#e6194b",
    "#3cb44b",
    "#4363d8",
    "#f58231",
    "#911eb4",
    "#46f0f0",
    "#f032e6",
    "#bcf60c",
    "#fabebe",
    "#008080",
    "#e6beff",
    "#9a6324",
]


def org_colors(sources: list[dict]) -> dict[str, str]:
    """Map each source's display name to a stable color, cycling through PALETTE."""
    assignment: dict[str, str] = {}
    for index, source in enumerate(sources):
        assignment[source["name"]] = PALETTE[index % len(PALETTE)]
    return assignment
23
+
24
+
25
def org_tag(org: str, colors: dict[str, str]) -> str:
    """Render an org name as a colored HTML span; the name is HTML-escaped."""
    color = colors[org]
    safe_name = escape(org)
    return f'<span style="color:{color}">{safe_name}</span>'
src/ui_log.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from datetime import datetime, timezone
5
+
6
+ _ACTIVE_LINES: list[str] = []
7
+
8
+
9
+ def ts() -> str:
10
+ return datetime.now(timezone.utc).isoformat(timespec="seconds")
11
+
12
+
13
+ def log(level: str, msg: str) -> None:
14
+ html = {
15
+ "INFO": "dodgerblue",
16
+ "WARN": "goldenrod",
17
+ "ERR": "crimson",
18
+ }
19
+ c = html.get(level)
20
+ tag = f"<span style=\"color:{c}\">{level}</span>" if c else level
21
+ _ACTIVE_LINES.append(f"[{ts()}] {tag}: {msg}")
22
+
23
+
24
def info(msg: str) -> None:
    """Append *msg* to the active log buffer at INFO level."""
    log("INFO", msg)


def warn(msg: str) -> None:
    """Append *msg* to the active log buffer at WARN level."""
    log("WARN", msg)


def error(msg: str) -> None:
    """Append *msg* to the active log buffer at ERR level."""
    log("ERR", msg)
34
+
35
+
36
@contextmanager
def bind(lines: list[str]):
    """Temporarily route module logging into *lines*; the previous buffer is restored on exit."""
    global _ACTIVE_LINES
    saved = _ACTIVE_LINES
    _ACTIVE_LINES = lines
    try:
        yield
    finally:
        # Restore even if the body raised, so sessions never leak into each other.
        _ACTIVE_LINES = saved
tests/__init__.py ADDED
File without changes
tests/test_golden.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, os, shutil, tempfile
2
+ from pathlib import Path
3
+
4
ROOT = Path(__file__).resolve().parents[1]  # repository root (tests/ is one level down)
GOLDEN = ROOT / ".data" / "events.json"  # committed golden snapshot of crawl results
6
+
7
+
8
+ def _load(p: Path) -> dict: return json.loads(p.read_text(encoding="utf-8"))
9
+
10
+
11
+ def _norm(d: dict) -> dict:
12
+ d = json.loads(json.dumps(d)); (d.get("meta") or {}).pop("cached_at", None)
13
+ for r in d.get("results") or []:
14
+ if isinstance(r, dict): r.pop("checked_at", None)
15
+ return d
16
+
17
+
18
def test_cached_results_match_golden():
    """The committed golden cache must be usable by CacheStore and round-trip intact."""
    golden = _load(GOLDEN)
    model = (golden.get("meta") or {}).get("model")
    assert isinstance(model, str)
    with tempfile.TemporaryDirectory() as td:
        data_dir = Path(td)
        shutil.copy2(GOLDEN, data_dir / "events.json")
        from src.fetcher import CacheStore, LlmConfig

        cache = CacheStore(config=LlmConfig(model=model), ttl_hours=9999.0, path=data_dir / "events.json")
        assert cache.is_usable()
        got = _load(cache.path)
        assert got["results"] == golden["results"]
27
+
28
+
29
def test_live_crawl_matches_golden_snapshot():
    """Opt-in live test: re-crawl every source and compare against the golden snapshot."""
    if os.environ.get("RUN_LIVE_TESTS") != "1":
        return  # requires network + LLM credentials; skipped by default
    golden = _load(GOLDEN)
    model = (golden.get("meta") or {}).get("model")
    assert isinstance(model, str)
    with tempfile.TemporaryDirectory() as td:
        data_dir = Path(td)
        # Pin the LLM determinism knobs so the crawl is reproducible.
        os.environ.update({"LLM_TEMPERATURE": "0", "LLM_SEED": "1", "LITELLM_FALLBACK_MODELS": ""})
        cached_at = (golden.get("meta") or {}).get("cached_at")
        if isinstance(cached_at, str) and cached_at.strip():
            os.environ["NOW_ISO"] = cached_at.strip()
        from src.fetcher import CacheStore, LlmConfig, OrgSource, SeminarFetcher

        cfg = LlmConfig(model=model)
        raw_sources = json.loads((ROOT / "sources.json").read_text(encoding="utf-8"))
        sources = [OrgSource.model_validate(s) for s in raw_sources]
        fetcher = SeminarFetcher(config=cfg, now=None, max_hops=3, max_events=3)
        results = [fetcher.fetch_next_event_for_org(o) for o in sources]
        ttl = float((golden.get("meta") or {}).get("ttl_hours") or 12.0)
        cache = CacheStore(config=cfg, ttl_hours=ttl, path=data_dir / "events.json")
        cache.write(results=results)
        produced = _load(cache.path)
        assert _norm(produced) == _norm(golden)