Spaces:
Sleeping
Sleeping
first commit
Browse files- .gitignore +7 -0
- ARCHITECTURE.md +264 -0
- README.md +54 -6
- app.py +153 -0
- requirements.txt +13 -0
- sources.json +61 -0
- src/__init__.py +0 -0
- src/fetcher.py +115 -0
- src/interested.py +67 -0
- src/models.py +245 -0
- src/org_colors.py +26 -0
- src/ui_log.py +44 -0
- tests/__init__.py +0 -0
- tests/test_golden.py +46 -0
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.data/
|
| 5 |
+
/data/
|
| 6 |
+
*.log
|
| 7 |
+
.env
|
ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture / Code Flow (with ASCII maps)
|
| 2 |
+
|
| 3 |
+
This repo is intentionally small:
|
| 4 |
+
|
| 5 |
+
- `app.py` = UI + HTTP API (Gradio mounted into FastAPI)
|
| 6 |
+
- `src/fetcher.py` = crawling + LLM extraction + validation + caching
|
| 7 |
+
- `sources.json` = list of org sources to crawl
|
| 8 |
+
- `.data/events.json` (or `/data/events.json`) = cache / golden output format
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## 1) High-level module map
|
| 13 |
+
|
| 14 |
+
```
|
| 15 |
+
+--------------------+ imports / calls +---------------------+
|
| 16 |
+
| app.py | ---------------------------> | src/fetcher.py |
|
| 17 |
+
| | | |
|
| 18 |
+
| SeminarsWebApp | | SeminarFetcher |
|
| 19 |
+
| - Gradio UI | | + refresh_all* |
|
| 20 |
+
| - /refresh API | | + models + cache |
|
| 21 |
+
+--------------------+ +---------------------+
|
| 22 |
+
|
|
| 23 |
+
| reads
|
| 24 |
+
v
|
| 25 |
+
+--------------------+
|
| 26 |
+
| sources.json |
|
| 27 |
+
| list[OrgSource] |
|
| 28 |
+
+--------------------+
|
| 29 |
+
|
| 30 |
+
Cache file (written/read by fetcher):
|
| 31 |
+
|
| 32 |
+
+--------------------+
|
| 33 |
+
| .data/events.json | (or /data/events.json)
|
| 34 |
+
| meta + results |
|
| 35 |
+
+--------------------+
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## 2) Runtime flow (what happens when you open the UI)
|
| 41 |
+
|
| 42 |
+
### 2.1 Gradio initial load
|
| 43 |
+
|
| 44 |
+
```
|
| 45 |
+
Browser
|
| 46 |
+
|
|
| 47 |
+
v
|
| 48 |
+
Gradio page load
|
| 49 |
+
|
|
| 50 |
+
v
|
| 51 |
+
app.py: demo.load(load_initial)
|
| 52 |
+
|
|
| 53 |
+
v
|
| 54 |
+
SeminarsWebApp._stream_refresh(force=False) [generator]
|
| 55 |
+
|
|
| 56 |
+
v
|
| 57 |
+
src/fetcher.py: refresh_all_stream(... force=False)
|
| 58 |
+
|
|
| 59 |
+
+--> if cache usable -> yield logs + cached results -> done
|
| 60 |
+
|
|
| 61 |
+
\--> else -> crawl + LLM -> write cache -> done
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
### 2.2 Manual refresh button
|
| 65 |
+
|
| 66 |
+
```
|
| 67 |
+
User clicks "Refresh now"
|
| 68 |
+
|
|
| 69 |
+
v
|
| 70 |
+
app.py: refresh_btn.click(refresh_click)
|
| 71 |
+
|
|
| 72 |
+
v
|
| 73 |
+
SeminarsWebApp._stream_refresh(force=True)
|
| 74 |
+
|
|
| 75 |
+
v
|
| 76 |
+
src/fetcher.py: refresh_all_stream(... force=True)
|
| 77 |
+
|
|
| 78 |
+
v
|
| 79 |
+
Always crawls + LLM, then writes cache
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## 3) Cache flow (explicit in the app)
|
| 85 |
+
|
| 86 |
+
The app explicitly checks cache before crawling.
|
| 87 |
+
|
| 88 |
+
```
|
| 89 |
+
SeminarsWebApp.stream_refresh(force=False)
|
| 90 |
+
|
|
| 91 |
+
v
|
| 92 |
+
cache = CacheStore(config=..., ttl_hours=...)
|
| 93 |
+
|
|
| 94 |
+
+--> if cache.is_usable(): cache.load() -> UI updates
|
| 95 |
+
|
|
| 96 |
+
\--> else: crawl + LLM -> cache.write(results)
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
---
|
| 100 |
+
|
| 101 |
+
## 4) Detailed fetcher flow (one org)
|
| 102 |
+
|
| 103 |
+
The fetcher is designed around a *stream* of events:
|
| 104 |
+
|
| 105 |
+
- log event: `("log", level, message)`
|
| 106 |
+
- result event: `("result", EventResult)`
|
| 107 |
+
|
| 108 |
+
### 4.1 One-org pipeline
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
SeminarFetcher.fetch_next_event_for_org_stream(org)
|
| 112 |
+
|
|
| 113 |
+
v
|
| 114 |
+
for hop in 1..max_hops:
|
| 115 |
+
|
|
| 116 |
+
+--> fetch_html(url)
|
| 117 |
+
| - httpx GET
|
| 118 |
+
| - if 403: optional curl fallback
|
| 119 |
+
|
|
| 120 |
+
+--> llm_extract(...)
|
| 121 |
+
| - text_and_links(html)
|
| 122 |
+
| - LiteLLM completion(...)
|
| 123 |
+
| - safe_json() + normalize_llm_payload()
|
| 124 |
+
| - Pydantic validation -> LlmHopResult
|
| 125 |
+
|
|
| 126 |
+
+--> validate_events(hop, now)
|
| 127 |
+
| - parse_dt(start_time)
|
| 128 |
+
| - filter to future events only
|
| 129 |
+
| - ensure evidence + http(s) URL
|
| 130 |
+
|
|
| 131 |
+
+--> yield ("result", EventResult)
|
| 132 |
+
|
|
| 133 |
+
\--> (optional) follow hop.next_url_to_check if provided
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
### 4.2 Key idea: strictness retry
|
| 137 |
+
|
| 138 |
+
If the LLM returns something that is not valid JSON or doesn’t validate, the code retries once in “strict” mode.
|
| 139 |
+
|
| 140 |
+
```
|
| 141 |
+
llm_extract(strict=False)
|
| 142 |
+
|
|
| 143 |
+
+--> (fails JSON / schema) => retry
|
| 144 |
+
v
|
| 145 |
+
llm_extract(strict=True)
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
---
|
| 149 |
+
|
| 150 |
+
## 5) Data model / JSON shapes
|
| 151 |
+
|
| 152 |
+
### 5.1 Source input (`sources.json`)
|
| 153 |
+
|
| 154 |
+
```
|
| 155 |
+
[
|
| 156 |
+
{
|
| 157 |
+
"id": "utoronto",
|
| 158 |
+
"name": "UofT Robotics",
|
| 159 |
+
"url": "https://...",
|
| 160 |
+
"tags": ["canada", "university"]
|
| 161 |
+
},
|
| 162 |
+
...
|
| 163 |
+
]
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
Validated into `OrgSource`.
|
| 167 |
+
|
| 168 |
+
### 5.2 Per-org output (`EventResult`)
|
| 169 |
+
|
| 170 |
+
```
|
| 171 |
+
EventResult:
|
| 172 |
+
org_id, org_name, source_url
|
| 173 |
+
status: "ok" | "no_upcoming" | "error"
|
| 174 |
+
events: [LlmEvent, ...]
|
| 175 |
+
checked_at
|
| 176 |
+
hops
|
| 177 |
+
visited_urls
|
| 178 |
+
error (optional)
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### 5.3 Cache file (`.data/events.json`)
|
| 182 |
+
|
| 183 |
+
```
|
| 184 |
+
{
|
| 185 |
+
"meta": {
|
| 186 |
+
"model": "...",
|
| 187 |
+
"schema_version": 3,
|
| 188 |
+
"cached_at": "...",
|
| 189 |
+
"ttl_hours": 12
|
| 190 |
+
},
|
| 191 |
+
"results": [ EventResult, ... ]
|
| 192 |
+
}
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
Cache is considered usable when:
|
| 196 |
+
- file exists
|
| 197 |
+
- file age < ttl
|
| 198 |
+
- `meta.schema_version == 3`
|
| 199 |
+
- `meta.model == current LLM model`
|
| 200 |
+
|
| 201 |
+
---
|
| 202 |
+
|
| 203 |
+
## 6) Where logs come from
|
| 204 |
+
|
| 205 |
+
Logs are generated in two layers:
|
| 206 |
+
|
| 207 |
+
1) Fetcher (per hop / per org)
|
| 208 |
+
|
| 209 |
+
```
|
| 210 |
+
"{org}: hop i/j — HTTP GET start: ..."
|
| 211 |
+
"{org}: hop i/j — HTTP GET done (...)"
|
| 212 |
+
"{org}: hop i/j — LLM call start (model=...)"
|
| 213 |
+
"{org}: hop i/j — LLM call done (...)"
|
| 214 |
+
"{org}: hop i/j — validating extracted event(s)"
|
| 215 |
+
"{org}: success (...)" OR "no upcoming events" OR "error (...)"
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
2) App wrapper (per org result summary)
|
| 219 |
+
|
| 220 |
+
```
|
| 221 |
+
"{org}: ok (k event(s))"
|
| 222 |
+
"{org}: no upcoming events found"
|
| 223 |
+
"{org}: <error message>"
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## 7) Environment variables (practical cheat-sheet)
|
| 229 |
+
|
| 230 |
+
### App / paths
|
| 231 |
+
|
| 232 |
+
- `SOURCES_PATH` (default `sources.json`)
|
| 233 |
+
- `DATA_DIR` (default `.data`, or `/data` if that directory exists)
|
| 234 |
+
- `CACHE_TTL_HOURS` (default `12`)
|
| 235 |
+
- `PORT` (default `7860`)
|
| 236 |
+
|
| 237 |
+
### `/refresh` auth
|
| 238 |
+
|
| 239 |
+
- `REFRESH_TOKEN` (required to use `/refresh`)
|
| 240 |
+
|
| 241 |
+
### LLM (LiteLLM)
|
| 242 |
+
|
| 243 |
+
- `LITELLM_MODEL` (or `GEMINI_MODEL` fallback)
|
| 244 |
+
- `LITELLM_API_KEY` (or `GEMINI_API_KEY` fallback)
|
| 245 |
+
- `LITELLM_API_BASE` (optional)
|
| 246 |
+
|
| 247 |
+
Optional knobs:
|
| 248 |
+
- `LLM_TEMPERATURE` (default `0`)
|
| 249 |
+
- `LLM_SEED` (optional)
|
| 250 |
+
- `LLM_MIN_INTERVAL_SECONDS` (optional throttling)
|
| 251 |
+
- `NOW_ISO` (optional override of “current time” for deterministic runs)
|
| 252 |
+
|
| 253 |
+
---
|
| 254 |
+
|
| 255 |
+
## 8) Quick “read order” (if you’re new)
|
| 256 |
+
|
| 257 |
+
1) `app.py`:
|
| 258 |
+
- `SeminarsWebApp._stream_refresh()` to see end-to-end UI flow
|
| 259 |
+
- `build_fastapi()` for `/refresh`
|
| 260 |
+
|
| 261 |
+
2) `src/fetcher.py`:
|
| 262 |
+
- `refresh_all_stream()` to see caching vs crawling
|
| 263 |
+
- `SeminarFetcher.fetch_next_event_for_org_stream()` for the main pipeline
|
| 264 |
+
- `llm_extract()` + `validate_events()` for correctness guarantees
|
README.md
CHANGED
|
@@ -1,12 +1,60 @@
|
|
| 1 |
---
|
| 2 |
-
title: Robotic
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Robotic seminars
|
| 3 |
+
emoji: "🤖"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: "5.12.0"
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Robotic seminars (HF Space)
|
| 13 |
+
|
| 14 |
+
This Hugging Face Space aggregates the **next upcoming event** per organization using an LLM via **LiteLLM**. It fetches each page as HTML, extracts a compact text context + candidate links, and asks the model to return structured JSON (up to 3 hops).
|
| 15 |
+
|
| 16 |
+
## Files
|
| 17 |
+
|
| 18 |
+
- [sources.json](sources.json): list of orgs + starting URLs
|
| 19 |
+
- [app.py](app.py): Gradio UI
|
| 20 |
+
- [src/fetcher.py](src/fetcher.py): LiteLLM hop loop, validation, caching
|
| 21 |
+
|
| 22 |
+
## Environment variables (HF Space “Secrets”)
|
| 23 |
+
|
| 24 |
+
- `LITELLM_MODEL` (recommended): LiteLLM model string, e.g. `gemini/gemini-2.0-flash`, `openai/gpt-4o-mini`, `anthropic/claude-3-5-sonnet-20241022`
|
| 25 |
+
- `LITELLM_FALLBACK_MODELS` (optional): comma-separated fallback models to try if the primary hits a rate limit
|
| 26 |
+
- `LLM_MIN_INTERVAL_SECONDS` (optional): minimum delay between LLM calls (useful for very low RPM limits)
|
| 27 |
+
- `LITELLM_API_KEY` (optional): explicit API key to pass to LiteLLM
|
| 28 |
+
- `LITELLM_API_BASE` (optional): custom base URL (useful for proxies/self-hosted endpoints)
|
| 29 |
+
|
| 30 |
+
Backwards-compatible (still accepted):
|
| 31 |
+
- `GEMINI_API_KEY` (optional): used as a fallback for `LITELLM_API_KEY`
|
| 32 |
+
- `GEMINI_MODEL` (optional): used as a fallback for `LITELLM_MODEL`
|
| 33 |
+
|
| 34 |
+
Provider-specific env vars also work (recommended):
|
| 35 |
+
- `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, etc.
|
| 36 |
+
|
| 37 |
+
- `CACHE_TTL_HOURS` (optional): defaults to `12`
|
| 38 |
+
|
| 39 |
+
## Cache
|
| 40 |
+
|
| 41 |
+
The app writes its cache to `/data/robotic_seminars/events.json`.
|
| 42 |
+
|
| 43 |
+
- On Hugging Face Spaces: enable **Persistent Storage** so `/data` exists and is writable.
|
| 44 |
+
- Locally: create `/data/robotic_seminars` and ensure it’s writable by your user.
|
| 45 |
+
|
| 46 |
+
## Local run
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
python -m venv .venv
|
| 50 |
+
source .venv/bin/activate
|
| 51 |
+
pip install -r requirements.txt
|
| 52 |
+
|
| 53 |
+
# Create a .env from the template and set your keys/model:
|
| 54 |
+
cp .env.example .env
|
| 55 |
+
|
| 56 |
+
# Or export env vars manually if you prefer.
|
| 57 |
+
python app.py
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
Open http://localhost:7860
|
app.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from fastapi import FastAPI
|
| 11 |
+
|
| 12 |
+
from src.fetcher import CacheStore, LlmConfig, OrgSource, SeminarFetcher
|
| 13 |
+
from src.interested import COLS, inc_interested, results_table
|
| 14 |
+
from src.models import parse_dt_utc
|
| 15 |
+
from src.org_colors import org_colors
|
| 16 |
+
from src.ui_log import bind as bind_log, error, info
|
| 17 |
+
|
| 18 |
+
# import debugpy
|
| 19 |
+
# print("Waiting for debugger attach...")
|
| 20 |
+
# debugpy.listen(5678)
|
| 21 |
+
# debugpy.wait_for_client()
|
| 22 |
+
# print("Debugger attached.")
|
| 23 |
+
|
| 24 |
+
load_dotenv(Path(__file__).with_name(".env"))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def ts() -> str:
    """Return the current UTC time as an ISO-8601 string (second precision)."""
    current = datetime.now(timezone.utc)
    return current.isoformat(timespec="seconds")
|
| 29 |
+
|
| 30 |
+
SOURCES_PATH = os.environ.get("SOURCES_PATH", "sources.json")  # org list location
TTL_HOURS = float(os.environ.get("CACHE_TTL_HOURS", "12"))  # cache freshness window (hours)
# Model + optional explicit API key for LiteLLM; both overridable via env vars.
LLM = LlmConfig(model=os.environ.get("LITELLM_MODEL", "gemini/gemini-2.0-flash"), api_key=os.environ.get("LITELLM_API_KEY"))

# Raw source dicts (validated into OrgSource inside stream_refresh) and
# the stable per-org display colors derived from them.
SOURCES_RAW = json.loads(Path(SOURCES_PATH).read_text(encoding="utf-8"))
COLORS = org_colors(SOURCES_RAW)

# Markdown bullet list rendered in the "Sources" accordion of the UI.
SOURCES_MD = "\n".join(
    f"- [{s['name']}]({s['url']})"
    for s in SOURCES_RAW
)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def stream_refresh(force: bool):
    """Generator that refreshes seminar results and streams UI updates.

    Yields tuples of (status markdown, table DataFrame, joined log HTML,
    raw result dicts, row map) matching the five Gradio outputs it is wired
    to.  With ``force=False`` a fresh-enough cache short-circuits the crawl.
    """
    logs: list[str] = []
    results: list[dict] = []
    colors = COLORS

    def emit(status: str):
        # Snapshot the current state into one Gradio output tuple.
        df, row_map = results_table(results, colors)
        return status, df, "<br>\n".join(logs), results, row_map

    with bind_log(logs):
        info(f"refresh(force={force}, ttl_hours={TTL_HOURS})")
        info(f"model={LLM.model}")

        try:
            done = 0
            sources = [OrgSource.model_validate(s) for s in SOURCES_RAW]

            started = ts()
            cache = CacheStore(config=LLM, ttl_hours=TTL_HOURS)
            # Fast path: serve the cached results when allowed and still usable.
            if not force and cache.is_usable():
                results.extend(json.loads(cache.path.read_text(encoding="utf-8"))["results"])
                info(f"used cache: {cache.path}")
                yield emit(f"Refreshed: {ts()} (started {started})")
                return
            info("cache miss, crawling")

            # Even a stale/forced-over cache is read so "interested" counters
            # can survive a re-crawl.
            old_results = json.loads(cache.path.read_text(encoding="utf-8"))["results"] if cache.path.exists() else []
            prev_by_org = {r["org_id"]: r["events"] for r in old_results}

            # Indexes for carrying interested_count over to re-fetched events:
            #   old_by_exact: (org, start-minute, url) -> count   (exact match)
            #   old_by_dt:    (org, start-minute) -> first seen count
            #   old_by_dt_n:  (org, start-minute) -> number of old events in
            #                 that slot; the datetime-only fallback is applied
            #                 only when the slot is unambiguous (count == 1).
            old_by_exact: dict[tuple[str, str, str], int] = {}
            old_by_dt: dict[tuple[str, str], int] = {}
            old_by_dt_n: dict[tuple[str, str], int] = {}
            for r in old_results:
                for ev in r["events"]:
                    dt = parse_dt_utc(ev["start_time"]).isoformat(timespec="minutes")
                    url = ev["event_url"] or ""
                    old_by_exact[(r["org_id"], dt, url)] = ev["interested_count"]
                    k2 = (r["org_id"], dt)
                    try:
                        old_by_dt_n[k2] += 1
                    except KeyError:
                        old_by_dt_n[k2] = 1
                    if k2 not in old_by_dt:
                        old_by_dt[k2] = ev["interested_count"]

            fetcher = SeminarFetcher(config=LLM, now=None, max_hops=3, max_events=3, previous_events_by_org=prev_by_org)
            event_results = []
            for org in sources:
                # Progress update before each org so the UI stays live.
                yield emit(f"Refreshing… {done}/{len(sources)} (started {started})")
                done += 1

                for r in fetcher.fetch_next_event_for_org_stream(org):
                    for ev in r.events:
                        dt = parse_dt_utc(ev.start_time).isoformat(timespec="minutes")
                        url = ev.event_url or ""
                        k1 = (r.org_id, dt, url)
                        if k1 in old_by_exact:
                            ev.interested_count = old_by_exact[k1]
                            continue
                        # URL changed (or was missing): fall back to the
                        # datetime-only match, but only if it is unambiguous.
                        k2 = (r.org_id, dt)
                        if k2 in old_by_dt and old_by_dt_n[k2] == 1:
                            ev.interested_count = old_by_dt[k2]
                    event_results.append(r)
                    results.append(r.model_dump())

            cache.write(results=event_results)
            info(f"wrote cache: {cache.path}")
            yield emit(f"Refreshed: {ts()} (started {started})")
        except Exception:
            import traceback

            # Surface the traceback in the UI log panel instead of crashing
            # the Gradio event handler.
            error(f"Unhandled exception:\n{traceback.format_exc()}")
            yield emit(f"Error: {ts()}")
            return
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# Gradio UI: status line, events table, refresh button, and two accordions
# (source links, streamed logs).  stream_refresh feeds all five outputs.
with gr.Blocks() as demo:
    gr.Markdown("# Robotic seminars\nNext upcoming event per org.")
    status = gr.Markdown("")
    table = gr.Dataframe(
        headers=COLS,
        # "markdown" columns render the Title link and the colored org tag.
        datatype=["str", "markdown", "str", "markdown", "str"],
        interactive=True,
        wrap=True,
    )
    # Hidden state mirroring the table: raw result dicts plus the row map
    # that ties each displayed row back to (result index, event index).
    results_state = gr.State([])
    row_map_state = gr.State([])
    refresh_btn = gr.Button("Refresh now")
    with gr.Accordion("Sources", open=False):
        gr.Markdown(SOURCES_MD)
    with gr.Accordion("Logs", open=False):
        logs_box = gr.Markdown()

    def on_select(results: list[dict], row_map: list[tuple[int, int]], evt: gr.SelectData):
        # Clicking the "Interested" cell bumps that event's counter.
        return inc_interested(evt, results, row_map, colors=COLORS, llm=LLM, ttl_hours=TTL_HOURS)

    # Initial page load and manual refresh share the same generator; only
    # the force flag differs (False = cache allowed, True = always crawl).
    demo.load(stream_refresh, inputs=[gr.State(False)], outputs=[status, table, logs_box, results_state, row_map_state])
    refresh_btn.click(stream_refresh, inputs=[gr.State(True)], outputs=[status, table, logs_box, results_state, row_map_state])
    table.select(
        on_select,
        inputs=[results_state, row_map_state],
        outputs=[table, results_state, row_map_state],
    )

# HF Spaces runs Gradio apps itself; avoid mounting + running Uvicorn here.
# app = gr.mount_gradio_app(FastAPI(), demo, path="/")
if __name__ == "__main__":
    # import uvicorn
    # uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))

    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", "7860")))
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.12.0
|
| 2 |
+
fastapi==0.115.6
|
| 3 |
+
uvicorn==0.34.0
|
| 4 |
+
pydantic==2.10.5
|
| 5 |
+
python-dateutil==2.9.0.post0
|
| 6 |
+
pandas==2.2.3
|
| 7 |
+
httpx==0.28.1
|
| 8 |
+
h2>=4.1.0
|
| 9 |
+
litellm>=1.0.0
|
| 10 |
+
python-dotenv>=1.0.0
|
| 11 |
+
beautifulsoup4>=4.12.2
|
| 12 |
+
pytest>=8.0.0
|
| 13 |
+
curl_cffi>=0.14.0
|
sources.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "cmu-ri-seminar",
|
| 4 |
+
"name": "Carnegie Mellon University Robotics Institute Seminar Series",
|
| 5 |
+
"url": "https://www.ri.cmu.edu/events/",
|
| 6 |
+
"tags": ["robotics"]
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"id": "rig",
|
| 10 |
+
"name": "Robotics Institute Germany (RIG) Lecture Series",
|
| 11 |
+
"url": "https://robotics-institute-germany.de/rig-lecture-series-weekly-online-lectures-on-robotics/",
|
| 12 |
+
"tags": ["robotics"]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"id": "stanford-engr319",
|
| 16 |
+
"name": "Stanford Robotics & Autonomous Systems Seminar (ENGR319)",
|
| 17 |
+
"url": "https://stanfordasl.github.io/robotics_seminar/",
|
| 18 |
+
"tags": ["robotics"]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"id": "utoronto-ri",
|
| 22 |
+
"name": "University of Toronto Robotics Institute Seminar Series",
|
| 23 |
+
"url": "https://robotics.utoronto.ca/seminar-series/",
|
| 24 |
+
"tags": ["robotics"]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"id": "eth-rvc-talks",
|
| 28 |
+
"name": "ETH Zürich Robotics, Vision, and Controls Talks",
|
| 29 |
+
"url": "https://robotics-talks.com/",
|
| 30 |
+
"tags": ["robotics"]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"id": "umd-mrc-seminars",
|
| 34 |
+
"name": "Maryland Robotics Center (UMD) Robotics Seminar Series",
|
| 35 |
+
"url": "https://robotics.umd.edu/events/mrc-seminars",
|
| 36 |
+
"tags": ["robotics"]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"id": "imperial-rl-seminar",
|
| 40 |
+
"name": "Imperial College London Robot Learning Seminar Series",
|
| 41 |
+
"url": "https://www.robot-learning.uk/seminar-series",
|
| 42 |
+
"tags": ["robotics"]
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"id": "gatech-irim-seminar",
|
| 46 |
+
"name": "Georgia Tech IRIM Seminar Series",
|
| 47 |
+
"url": "https://research.gatech.edu/robotics/irim-seminar-series",
|
| 48 |
+
"tags": ["robotics"]
|
| 49 |
+
},
|
| 50 |
+
{ "id": "robot-talk",
|
| 51 |
+
"name": "Robot Talk",
|
| 52 |
+
"url": "https://www.robottalk.org/latest-episodes/",
|
| 53 |
+
"tags": ["robotics"]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"id": "montreal-robotics",
|
| 57 |
+
"name": "Montréal Robotics / Mila Robot Learning Seminar",
|
| 58 |
+
"url": "https://montrealrobotics.ca/robotlearningseries/",
|
| 59 |
+
"tags": ["robotics"]
|
| 60 |
+
}
|
| 61 |
+
]
|
src/__init__.py
ADDED
|
File without changes
|
src/fetcher.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from typing import Iterator
|
| 6 |
+
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
from curl_cffi import requests as r # type: ignore
|
| 9 |
+
|
| 10 |
+
from dateutil import parser as dtparser
|
| 11 |
+
|
| 12 |
+
from .models import (
|
| 13 |
+
CacheStore,
|
| 14 |
+
EventResult,
|
| 15 |
+
LlmConfig,
|
| 16 |
+
LlmEvent,
|
| 17 |
+
OrgSource,
|
| 18 |
+
USER_AGENT,
|
| 19 |
+
llm_extract,
|
| 20 |
+
parse_dt_utc,
|
| 21 |
+
)
|
| 22 |
+
from .ui_log import error, info, warn
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class SeminarFetcher:
    """Crawls each org's event page and extracts upcoming events via an LLM.

    Per org it follows up to ``max_hops`` pages (the LLM may suggest a
    ``next_url_to_check``), keeps only strictly-future events sorted by start
    time, and caps the result at ``max_events``.
    """

    def __init__(
        self,
        config: LlmConfig,
        now: datetime | None = None,
        max_hops: int = 3,
        max_events: int = 3,
        previous_events_by_org: dict[str, list[dict]] | None = None,
    ):
        self.config = config
        self.max_hops = max_hops
        self.max_events = max_events
        self.previous_events_by_org = previous_events_by_org or {}

        # "Now" resolution order: explicit argument > NOW_ISO env override
        # (deterministic runs) > wall clock.  Always normalized to aware UTC;
        # naive datetimes are interpreted as UTC rather than local time.
        if now is None:
            raw = (os.environ.get("NOW_ISO") or "").strip()
            now = dtparser.isoparse(raw) if raw else datetime.now(timezone.utc)
        if now.tzinfo is None:
            now = now.replace(tzinfo=timezone.utc)
        self.now = now.astimezone(timezone.utc)

    def fetch_html(self, url: str) -> str:
        """GET *url* with Chrome-impersonating TLS (curl_cffi) and return the body.

        Raises on transport errors and non-2xx responses.
        """
        p = urlparse(url)
        resp = r.get(
            url,
            timeout=20,
            allow_redirects=True,
            impersonate="chrome120",
            headers={"User-Agent": USER_AGENT, "Accept": "text/html", "Referer": f"{p.scheme}://{p.netloc}/"},
        )
        resp.raise_for_status()
        return resp.text

    def fetch_next_event_for_org_stream(self, org: OrgSource) -> Iterator[EventResult]:
        """Yield exactly one EventResult for *org* (generator for streaming UIs).

        Network/LLM failures are captured into the yielded result as
        ``status="error"`` instead of propagating, so one unreachable org
        cannot abort a caller iterating over many orgs.
        """
        checked_at = datetime.now(timezone.utc).isoformat()
        now_iso = self.now.isoformat()
        url = str(org.url)
        visited: list[str] = []
        info(f"Inspecting {org.name}…")
        # Pessimistic default; overwritten as soon as a hop concludes.
        result = EventResult(org_id=org.id, org_name=org.name, source_url=str(org.url), status="no_upcoming",
            events=[], checked_at=checked_at, hops=self.max_hops, visited_urls=visited, error="Max hops reached.",
        )

        for hop_i in range(1, self.max_hops + 1):
            visited.append(url)
            info(f"Inspecting {org.name}: fetching {url}")

            try:
                hop = llm_extract(
                    config=self.config,
                    org=org,
                    url=url,
                    page_html=self.fetch_html(url),
                    now_iso=now_iso,
                    previous_events=self.previous_events_by_org.get(org.id, []),
                )
            except Exception as exc:
                # HTTP/transport/LLM failure: record it on the result rather
                # than raising, so callers crawling all orgs keep going.
                error(f"Inspecting {org.name}: fetch/extract failed: {exc}")
                result.status = "error"
                result.hops = hop_i
                result.error = str(exc)
                break
            if hop.status != "ok":
                # The LLM may point us at a better page; follow it unless we
                # have already been there (loop guard).
                next_url = hop.next_url_to_check
                if next_url and next_url not in visited:
                    info(f"Inspecting {org.name}: following {next_url}")
                    url = next_url
                    continue
                (error if hop.status == "error" else warn)(f"Inspecting {org.name}: {hop.status}: {hop.error}")
                result.status = hop.status
                result.hops = hop_i
                result.error = hop.error
                break

            # Keep only strictly-future events, earliest first, capped at max_events.
            events: list[LlmEvent] = []
            parsed = [(parse_dt_utc(e.start_time), e) for e in hop.events]
            parsed.sort(key=lambda x: x[0])
            for dt, e in parsed:
                if dt <= self.now:
                    continue
                if e.event_url is None:
                    # Fall back to the page the event was found on.
                    e = e.model_copy(update={"event_url": url})
                events.append(e)
                if len(events) >= self.max_events:
                    break

            result.status = "ok" if events else "no_upcoming"
            result.events = events
            result.hops = hop_i
            result.error = hop.error
            if events:
                info(f"Found {len(events)} upcoming event(s) for {org.name}")
            else:
                warn(f"No upcoming events found for {org.name}")
            break

        yield result

    def fetch_next_event_for_org(self, org: OrgSource) -> EventResult:
        """Blocking convenience wrapper around the streaming variant."""
        return next(self.fetch_next_event_for_org_stream(org))
|
src/interested.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
from .fetcher import CacheStore, EventResult, LlmConfig
|
| 7 |
+
from .org_colors import org_tag
|
| 8 |
+
|
| 9 |
+
COLS = ["Date/Time (UTC)", "Title", "Speaker", "Organization", "Interested"]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def results_table(results: list[dict], colors: dict[str, str]) -> tuple[pd.DataFrame, list[tuple[int, int]]]:
    """Flatten per-org results into a display DataFrame plus a row map.

    The row map pairs each displayed row with its (result index, event index)
    in *results*, so click handlers can locate the underlying event.
    """
    records: list[dict[str, object]] = []
    for res_idx, res in enumerate(results):
        if res["status"] != "ok":
            continue  # errored / empty orgs contribute no rows
        for ev_idx, event in enumerate(res["events"]):
            name = event["speaker"] or ""
            affiliation = event["affiliation"] or ""
            if name and affiliation:
                speaker = f"{name} ({affiliation})"
            else:
                speaker = name or affiliation
            title = event["title"]
            if event["event_url"]:
                title = f"[{title}]({event['event_url']})"
            records.append(
                {
                    "Date/Time (UTC)": event["start_time"],
                    "Title": title,
                    "Speaker": speaker,
                    "Organization": org_tag(res["org_name"], colors),
                    "Interested": f"{event['interested_count']} (+)",
                    "_org": res["org_name"],
                    "_r": res_idx,
                    "_e": ev_idx,
                }
            )
    if not records:
        return pd.DataFrame(columns=COLS), []

    frame = pd.DataFrame(records)
    # Sort chronologically (unparseable dates last), tie-breaking by org name.
    frame["_sort"] = pd.to_datetime(frame["Date/Time (UTC)"], utc=True, errors="coerce")
    frame = frame.sort_values(by=["_sort", "_org"], na_position="last").reset_index(drop=True)
    row_map = list(zip(frame["_r"].astype(int).tolist(), frame["_e"].astype(int).tolist()))
    return frame.drop(columns=["_sort", "_org", "_r", "_e"])[COLS], row_map
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def inc_interested(
    evt: gr.SelectData | None,
    results: list[dict],
    row_map: list[tuple[int, int]],
    *,
    colors: dict[str, str],
    llm: LlmConfig,
    ttl_hours: float,
):
    """Handle a table cell click: bump interested_count for "Interested" clicks.

    Any other click (no event data, wrong column, stale row index) is a no-op
    that simply re-renders the current state. A successful bump is persisted
    to the cache so it survives restarts.

    Returns a fresh (DataFrame, results, row_map) triple in every branch so the
    UI state stays consistent.
    """

    def _rendered():
        # Single exit path: re-render the table from the current results.
        df, fresh_map = results_table(results, colors)
        return df, results, fresh_map

    if evt is None or evt.index is None:
        return _rendered()

    row = int(evt.index[0])
    col = int(evt.index[1])
    # Only clicks on the "Interested" column mutate state; also guard the row
    # index, since the rendered table may be stale relative to row_map.
    if col != COLS.index("Interested") or not (0 <= row < len(row_map)):
        return _rendered()

    r_i, ev_i = row_map[row]
    results[r_i]["events"][ev_i]["interested_count"] += 1

    # Persist the bumped count alongside the cached crawl results.
    cache = CacheStore(config=llm, ttl_hours=ttl_hours)
    cache.write(results=[EventResult.model_validate(x) for x in results])

    return _rendered()
|
src/models.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Literal
|
| 8 |
+
|
| 9 |
+
import html
|
| 10 |
+
from urllib.parse import urljoin, urlparse
|
| 11 |
+
|
| 12 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 13 |
+
|
| 14 |
+
# Allowed outcome states for a crawl / extraction attempt.
Status = Literal["ok", "no_upcoming", "error"]

# Bump whenever the cached JSON layout changes so stale caches are discarded.
CACHE_SCHEMA_VERSION = 4

# Browser-like User-Agent sent with page fetches (presumably to avoid naive
# bot blocking — TODO confirm against the fetcher's requirements).
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120 Safari/537.36"
)

# Instructions for the extraction LLM, used verbatim as the system message in
# llm_extract(). Keep the trailing Schema section in sync with
# LlmHopResult/LlmEvent. (A ~70-line commented-out legacy version of this
# prompt was removed; retrieve it from version control if ever needed.)
SYSTEM_PROMPT = """
You are a JSON extraction engine. You do NOT write code.

CRITICAL OUTPUT CONSTRAINTS (HARD):
- Your entire reply MUST be valid JSON (RFC 8259).
- Reply with exactly ONE JSON object.
- The first non-whitespace character MUST be "{" and the last MUST be "}".
- Use double quotes for all JSON strings. Never use single quotes.
- Do NOT include markdown fences (```), explanations, pseudocode, or Python.

If you cannot follow these constraints, reply exactly:
{"status":"error","error":"non_json_or_invalid_schema"}

Event extraction rules (HARD):
- Return up to 3 upcoming events after "now", sorted by start_time ascending.
- Every event MUST include start_time. If you cannot find a date/time in PAGE_TEXT for an event, DO NOT include that event.
- If you find zero events with a date/time, return {"status":"no_upcoming"}.

Title rule (HARD):
- title MUST be copied verbatim from PAGE_TEXT (no paraphrasing).
- title MUST come from the same local event block as the date/time:
- it must appear within 300 characters of the date/time text you used for start_time.
- Do NOT use site/series/page headings or navigation as title.
Examples of INVALID titles: "Seminar Series", "Events", "Robotics", "University of Toronto", page header text.
- If you cannot find a specific talk/topic title near the date/time, DO NOT include the event.

If PREVIOUS_EVENTS is provided:
- Use PREVIOUS_EVENTS as a strict copy source.
- If PAGE_TEXT contains an event that matches one in PREVIOUS_EVENTS, you MUST include that event in your output and you MUST copy the entire event object exactly from PREVIOUS_EVENTS.
- Do NOT omit a matched event. Do NOT say it is "already known". Do NOT reduce the number of returned events because PREVIOUS_EVENTS were provided.
- If PREVIOUS_EVENTS contains events that are NOT present in PAGE_TEXT, ignore them.

Final self-check (HARD, perform before replying):
- Your reply must be valid JSON only.
- For each event: verify start_time exists and is a non-empty string.
- If an event has missing/empty start_time, REMOVE that event.
- If no events remain, output {"status":"no_upcoming"}.

Schema:
{ "status": "ok"|"no_upcoming"|"error",
"events": [{"title": "...", "start_time": "...", "event_url": "...", "speaker": "", "affiliation": null, "interested_count": 0}],
"error": "...",
"next_url_to_check": "..." }

"""
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class OrgSource(BaseModel):
    """One organization's seminar page to crawl, as declared in sources.json."""

    id: str  # stable identifier for the organization
    name: str  # human-readable organization name shown in the UI
    url: HttpUrl  # entry-point page to crawl for upcoming events
    tags: list[str] = Field(default_factory=list)  # free-form labels for the source
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class LlmEvent(BaseModel):
    """A single upcoming event as extracted by the LLM from a source page."""

    title: str  # talk title, copied verbatim from the page text
    start_time: str  # ISO-8601 datetime or bare date; required by the prompt
    event_url: str | None = None  # per-event URL when the page has one
    speaker: str  # speaker name (may be an empty string)
    affiliation: str | None = None  # speaker's institution, when present
    interested_count: int = 0  # UI "interested" click counter, persisted in the cache
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class LlmHopResult(BaseModel):
    """Validated LLM output for one crawled page ("hop")."""

    status: Status  # "ok" | "no_upcoming" | "error"
    events: list[LlmEvent] = Field(default_factory=list)  # populated when status == "ok"
    error: str | None = None  # short error message when status == "error"
    next_url_to_check: str | None = None  # follow-up link when the schedule lives elsewhere
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class EventResult(BaseModel):
    """Final crawl outcome for one organization, as stored in the cache file."""

    org_id: str  # OrgSource.id of the crawled organization
    org_name: str  # OrgSource.name, denormalized for display
    source_url: str  # entry-point URL that was crawled
    status: Status  # overall outcome: "ok" | "no_upcoming" | "error"
    events: list[LlmEvent] = Field(default_factory=list)  # upcoming events found
    checked_at: str  # timestamp string of when the crawl ran
    hops: int = 0  # number of pages followed to reach the schedule
    visited_urls: list[str] = Field(default_factory=list)  # pages fetched during the crawl
    error: str | None = None  # failure description when status == "error"
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@dataclass
class LlmConfig:
    """Connection settings forwarded to litellm's completion() call."""

    model: str  # litellm model identifier; also part of the cache validity key
    api_key: str | None = None  # forwarded to litellm only when set
    api_base: str | None = None  # custom endpoint, forwarded only when set
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class CacheStore:
    """JSON file cache of crawl results with a freshness TTL.

    The cache is invalidated by age (ttl_hours), by schema version, and by a
    change of LLM model, so results produced by older code or another model
    are never reused.
    """

    def __init__(self, config: LlmConfig, ttl_hours: float, path: Path = Path("/data/robotic_seminars/events.json")):
        self.config = config  # model identity participates in cache validity
        self.ttl_hours = ttl_hours  # maximum age before the cache counts as stale
        self.path = path  # location of the cache file

    def is_fresh(self) -> bool:
        """Return True when the cache file exists and is younger than the TTL."""
        if not self.path.exists():
            return False
        mtime = datetime.fromtimestamp(self.path.stat().st_mtime, tz=timezone.utc)
        return (datetime.now(timezone.utc) - mtime).total_seconds() < self.ttl_hours * 3600

    def is_usable(self) -> bool:
        """Return True when the cache is fresh AND was written by this schema/model.

        A corrupt, truncated, or schema-less cache file is treated as unusable
        rather than raising, so one bad write can never wedge the app.
        """
        if not self.is_fresh():
            return False
        try:
            meta = json.loads(self.path.read_text(encoding="utf-8"))["meta"]
            return meta["schema_version"] == CACHE_SCHEMA_VERSION and meta["model"] == self.config.model
        except (json.JSONDecodeError, KeyError, TypeError, OSError):
            # Unreadable or malformed cache: force a re-crawl instead of crashing.
            return False

    def write(self, *, results: list[EventResult]) -> None:
        """Persist results plus cache metadata as pretty-printed UTF-8 JSON."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "meta": {
                "model": self.config.model,
                "schema_version": CACHE_SCHEMA_VERSION,
                "cached_at": datetime.now(timezone.utc).isoformat(),
                "ttl_hours": self.ttl_hours,
            },
            "results": [r.model_dump() for r in results],
        }
        self.path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def parse_dt_utc(value: str) -> datetime:
    """Parse a date/time string and normalize it to an aware UTC datetime.

    Naive inputs (no offset) are assumed to already be in UTC.
    """
    from dateutil import parser as dtparser  # deferred: only needed here

    parsed = dtparser.parse(value)
    if parsed.tzinfo is None:
        # No offset in the input: label it UTC rather than local time.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def text_and_links(page_html: str, *, base_url: str, limit: int = 40) -> tuple[str, list[str]]:
    """Extract visible text and same-domain links from an HTML page.

    Returns (text, links): the page text with script/style/noscript removed,
    blank lines dropped, and capped at 24000 characters; plus up to `limit`
    absolute http(s) URLs whose host matches `base_url`'s host.
    """
    from bs4 import BeautifulSoup  # deferred: heavy dependency

    soup = BeautifulSoup(html.unescape(page_html), "html.parser")
    # Remove non-content elements before pulling the text.
    for junk in soup(["script", "style", "noscript"]):
        junk.decompose()
    stripped = (line.strip() for line in soup.get_text("\n").splitlines())
    text = "\n".join(line for line in stripped if line)[:24000]

    host = urlparse(base_url).netloc.lower()
    found: list[str] = []
    for anchor in soup.find_all("a", href=True):
        absolute = urljoin(base_url, str(anchor["href"]).strip())
        parts = urlparse(absolute)
        # Keep only same-host http(s) links; external sites are never followed.
        if parts.scheme in {"http", "https"} and parts.netloc.lower() == host:
            found.append(absolute)
            if len(found) >= limit:
                break

    return text, found
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def llm_extract(*, config: LlmConfig, org: OrgSource, url: str, page_html: str, now_iso: str, previous_events: list[dict]) -> LlmHopResult:
    """Ask the LLM to extract upcoming events from one fetched page.

    Sends the page text, its same-domain links, and any previously extracted
    events, then parses and schema-validates the model's JSON reply.

    Raises:
        ValueError: when the reply contains no JSON object, is not valid JSON,
            or fails LlmHopResult validation.
    """
    from litellm import completion  # type: ignore

    page_text, links = text_and_links(page_html, base_url=url)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": json.dumps({"org": org.name, "now": now_iso, "source_url": url, "PREVIOUS_EVENTS": previous_events})},
        {"role": "user", "content": "LINKS:\n" + "\n".join(links)},
        {"role": "user", "content": "PAGE_TEXT_BEGIN\n" + page_text + "\nPAGE_TEXT_END"},
        {"role": "user", "content": "Return ONLY one JSON object (no markdown, no code). "
                                    "Must start with '{' and end with '}'. Use double quotes only. "
                                    "Before returning, delete any event missing/empty start_time. "
                                    "Title must be copied verbatim from PAGE_TEXT near the date/time. "
                                    "IMPORTANT: PREVIOUS_EVENTS are NOT a reason to omit events. If PAGE_TEXT contains an event that matches PREVIOUS_EVENTS, you MUST re-output it by copying the entire event object exactly from PREVIOUS_EVENTS. "
                                    "If none remain, return {\"status\":\"no_upcoming\"}."}
    ]

    kwargs: dict[str, object] = {"model": config.model, "temperature": 0}
    if config.api_key:
        kwargs["api_key"] = config.api_key
    if config.api_base:
        kwargs["api_base"] = config.api_base

    raw = completion(messages=messages, **kwargs)["choices"][0]["message"]["content"]
    # Take the outermost {...} span. This tolerates markdown fences and chatter
    # around the object WITHOUT mutating text inside it. (The previous approach
    # deleted every literal "json" and "`" substring from the whole reply,
    # which corrupted any title/URL containing those characters.)
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"LLM did not return a JSON object:\n{raw}")
    candidate = raw[start : end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as e:
        raise ValueError(f"LLM did not return valid JSON:\n{candidate}") from e

    # Validate exactly once (previously validated twice).
    try:
        return LlmHopResult.model_validate(parsed)
    except Exception as e:
        raise ValueError(f"LLM returned JSON that does not match schema: {e}\n{parsed}") from e
|
src/org_colors.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from html import escape
|
| 4 |
+
|
| 5 |
+
PALETTE = [
|
| 6 |
+
"#e6194b",
|
| 7 |
+
"#3cb44b",
|
| 8 |
+
"#4363d8",
|
| 9 |
+
"#f58231",
|
| 10 |
+
"#911eb4",
|
| 11 |
+
"#46f0f0",
|
| 12 |
+
"#f032e6",
|
| 13 |
+
"#bcf60c",
|
| 14 |
+
"#fabebe",
|
| 15 |
+
"#008080",
|
| 16 |
+
"#e6beff",
|
| 17 |
+
"#9a6324",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def org_colors(sources: list[dict]) -> dict[str, str]:
    """Map each source's name to a palette color, cycling when sources exceed it."""
    assignment: dict[str, str] = {}
    for idx, source in enumerate(sources):
        assignment[source["name"]] = PALETTE[idx % len(PALETTE)]
    return assignment
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def org_tag(org: str, colors: dict[str, str]) -> str:
    """Render an organization name as a colored HTML span, escaping the name.

    Organizations missing from `colors` fall back to the inherited text color
    instead of raising KeyError.
    """
    color = colors.get(org, "inherit")
    return f'<span style="color:{color}">{escape(org)}</span>'
|
src/ui_log.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from contextlib import contextmanager
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
|
| 6 |
+
_ACTIVE_LINES: list[str] = []
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def ts() -> str:
    """Current UTC time as an ISO-8601 string with seconds precision."""
    now = datetime.now(timezone.utc)
    return now.isoformat(timespec="seconds")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Level -> CSS color for the rendered log line. Hoisted out of log() so the
# map is built once, and renamed from a local called `html`, which shadowed
# the stdlib module name in a module that emits HTML.
_LEVEL_COLORS = {
    "INFO": "dodgerblue",
    "WARN": "goldenrod",
    "ERR": "crimson",
}


def log(level: str, msg: str) -> None:
    """Append a timestamped, color-tagged line to the active log buffer.

    Unknown levels are emitted without color markup. `msg` is not
    HTML-escaped, so callers must not pass untrusted markup.
    """
    color = _LEVEL_COLORS.get(level)
    tag = f"<span style=\"color:{color}\">{level}</span>" if color else level
    _ACTIVE_LINES.append(f"[{ts()}] {tag}: {msg}")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def info(msg: str) -> None:
    """Log *msg* at INFO level."""
    log("INFO", msg)


def warn(msg: str) -> None:
    """Log *msg* at WARN level."""
    log("WARN", msg)


def error(msg: str) -> None:
    """Log *msg* at ERR level."""
    log("ERR", msg)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@contextmanager
def bind(lines: list[str]):
    """Temporarily redirect module-level logging into *lines*.

    While the context is active, log()/info()/warn()/error() append to the
    given list; the previous buffer is restored on exit even if an exception
    occurs. NOTE(review): this swaps a module global without locking, so
    concurrent use from multiple threads/sessions would interleave buffers —
    confirm single-threaded use.
    """
    global _ACTIVE_LINES
    prev = _ACTIVE_LINES
    _ACTIVE_LINES = lines
    try:
        yield
    finally:
        _ACTIVE_LINES = prev
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_golden.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json, os, shutil, tempfile
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 5 |
+
GOLDEN = ROOT / ".data" / "events.json"
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _load(p: Path) -> dict: return json.loads(p.read_text(encoding="utf-8"))
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _norm(d: dict) -> dict:
|
| 12 |
+
d = json.loads(json.dumps(d)); (d.get("meta") or {}).pop("cached_at", None)
|
| 13 |
+
for r in d.get("results") or []:
|
| 14 |
+
if isinstance(r, dict): r.pop("checked_at", None)
|
| 15 |
+
return d
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_cached_results_match_golden():
    """A cache file copied from the golden snapshot must be usable and intact."""
    golden = _load(GOLDEN)
    model = (golden.get("meta") or {}).get("model")
    assert isinstance(model, str)
    with tempfile.TemporaryDirectory() as td:
        data_dir = Path(td)
        shutil.copy2(GOLDEN, data_dir / "events.json")
        from src.fetcher import CacheStore, LlmConfig

        cache = CacheStore(config=LlmConfig(model=model), ttl_hours=9999.0, path=data_dir / "events.json")
        assert cache.is_usable()
        got = _load(cache.path)
        assert got["results"] == golden["results"]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_live_crawl_matches_golden_snapshot():
    """Re-crawl every source live and compare against the golden snapshot.

    Opt-in: only runs when RUN_LIVE_TESTS=1 (it hits the network and the LLM).
    """
    if os.environ.get("RUN_LIVE_TESTS") != "1":
        return
    golden = _load(GOLDEN)
    model = (golden.get("meta") or {}).get("model")
    assert isinstance(model, str)
    with tempfile.TemporaryDirectory() as td:
        data_dir = Path(td)
        # Pin the knobs that influence LLM output so the run is reproducible.
        os.environ.update({"LLM_TEMPERATURE": "0", "LLM_SEED": "1", "LITELLM_FALLBACK_MODELS": ""})
        cached_at = (golden.get("meta") or {}).get("cached_at")
        if isinstance(cached_at, str) and cached_at.strip():
            # Crawl "as of" the snapshot time so the same events are upcoming.
            os.environ["NOW_ISO"] = cached_at.strip()
        from src.fetcher import CacheStore, LlmConfig, OrgSource, SeminarFetcher

        cfg = LlmConfig(model=model)
        raw_sources = json.loads((ROOT / "sources.json").read_text(encoding="utf-8"))
        sources = [OrgSource.model_validate(s) for s in raw_sources]
        fetcher = SeminarFetcher(config=cfg, now=None, max_hops=3, max_events=3)
        results = [fetcher.fetch_next_event_for_org(o) for o in sources]
        cache = CacheStore(
            config=cfg,
            ttl_hours=float((golden.get("meta") or {}).get("ttl_hours") or 12.0),
            path=data_dir / "events.json",
        )
        cache.write(results=results)
        produced = _load(cache.path)
        assert _norm(produced) == _norm(golden)
|