rairo committed on
Commit
0135342
·
verified ·
1 Parent(s): e888fa3

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +208 -2
main.py CHANGED
@@ -12,6 +12,9 @@ from pymysql.err import OperationalError
12
  import threading
13
  warnings.filterwarnings("ignore")
14
 
 
 
 
15
  # ───────────────────────────────────────────────────────────────────────────────
16
  # CONFIG
17
  # ───────────────────────────────────────────────────────────────────────────────
@@ -248,6 +251,140 @@ def gemini_explain(prompt: str, sys: str = None, model: str = EXPLAIN_MODEL) ->
248
  # ───────────────────────────────────────────────────────────────────────────────
249
  # UTIL: Build graph & timeline from events (+ risk overlays)
250
  # ───────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  def _policy_hits_for_date(d: str):
252
  """Return policy codes a given ISO date falls into."""
253
  if not d:
@@ -430,13 +567,82 @@ def graph(object_id: int):
430
  obj = cur.fetchone()
431
  if not obj:
432
  return jsonify({"ok": False, "error": "not_found"}), 404
 
433
  cur.execute("""SELECT event_type, date_from, date_to, place, actor, source_ref
434
  FROM provenance_events WHERE object_id=%s
435
  ORDER BY COALESCE(date_from,'0001-01-01')""", (object_id,))
436
  events = cur.fetchall()
437
- return jsonify({"ok": True, **build_graph_from_events(obj, events)})
438
 
439
- @app.get("/api/timeline/<int:object_id>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  @with_db_retry
441
  def timeline(object_id: int):
442
  with cursor() as cur:
 
12
  import threading
13
  warnings.filterwarnings("ignore")
14
 
15
+ # ── NEW: lightweight event inference from sentences ───────────────────────────
16
+ import re
17
+ from typing import List, Dict, Any, Optional
18
  # ───────────────────────────────────────────────────────────────────────────────
19
  # CONFIG
20
  # ───────────────────────────────────────────────────────────────────────────────
 
251
  # ───────────────────────────────────────────────────────────────────────────────
252
  # UTIL: Build graph & timeline from events (+ risk overlays)
253
  # ───────────────────────────────────────────────────────────────────────────────
254
+
255
+
256
+
257
# Catalogue verb (lower-case) -> canonical event_type stored on provenance events.
EVENT_VERBS = {
    "sold": "SOLD",
    "purchased": "PURCHASED",
    "bought": "PURCHASED",
    "acquired": "ACQUIRED",
    "donated": "DONATED",
    "gifted": "DONATED",
    "bequeathed": "BEQUEATHED",
    "consigned": "CONSIGNED",
    "exhibited": "EXHIBITED",
    "exported": "EXPORTED",
    "imported": "IMPORTED",
}

YEAR_RE = re.compile(r"\b(1[6-9]\d{2}|20\d{2})\b")  # 1600–2099

# Any known verb as a whole word, so "unsold"/"resold" are not read as "sold".
_VERB_RE = re.compile(
    r"\b(" + "|".join(re.escape(v) for v in EVENT_VERBS) + r")\b",
    re.IGNORECASE,
)

# '<transfer verb> to|by|from <rest of sentence>'
_TRANSFER_RE = re.compile(
    r"\b(sold|purchased|bought|acquired|donated|gifted|bequeathed|consigned)"
    r"\s+(?:to|by|from)\s+(.*)$",
    re.IGNORECASE,
)

# Trailing ', 2000' / ', by 2000' / ' in 1965' and everything after it.
# The \b before by/in keeps names such as "Derby" or "Berlin" intact.
_TRAILING_DATE_RE = re.compile(
    r"(?:,\s*)?(?:\b(?:by|in)\s*)?\b(?:1[6-9]\d{2}|20\d{2})\b.*$",
    re.IGNORECASE,
)

# Commas that are not inside a parenthesised group.
_COMMA_OUTSIDE_PARENS_RE = re.compile(r",(?![^()]*\))")

_SOLD_TO_RE = re.compile(r"\bsold\s+to\s+([^,.;]+)", re.IGNORECASE)


def _clean(s: Optional[str]) -> Optional[str]:
    """Collapse whitespace and trim surrounding punctuation; return None if nothing is left."""
    if not s:
        return None
    s = re.sub(r"\s+", " ", s).strip(" ,.;:-–—")
    return s or None


def _infer_from_sentence(txt: str) -> Optional[Dict[str, Any]]:
    """
    Very pragmatic patterns that cover most catalogue phrasing:
      - 'sold to X, <place>, 2000'
      - 'sold to X, by 2000'
      - 'purchased from Y in 1965'
      - 'donated by X, <place>, 1971'
    Returns a dict compatible with provenance_events rows, or None when the
    sentence is empty or contains no known verb.

    Matching is done case-insensitively on the original text (never on a
    lower-cased copy), because str.lower() can change the string length and
    would make match offsets point at the wrong characters.
    """
    if not txt:
        return None

    # Prefer the verb of the '<verb> to/by/from ...' match so that event_type
    # always agrees with the fragment the actor/place are extracted from.
    transfer = _TRANSFER_RE.search(txt)
    verb_match = transfer or _VERB_RE.search(txt)
    if not verb_match:
        return None
    event_type = EVENT_VERBS.get(verb_match.group(1).casefold())
    if event_type is None:
        return None

    # pull a year (prefers the last year in the string)
    years = YEAR_RE.findall(txt)
    year = years[-1] if years else None

    actor = None
    place = None

    if transfer:
        # Fragment after 'to/by/from', minus the trailing date expression.
        frag = _TRAILING_DATE_RE.sub("", transfer.group(2).strip())
        frag = frag.strip().strip(" ,.;")
        # First comma-separated token is the actor; the rest (if any) is the place.
        parts = [p.strip() for p in _COMMA_OUTSIDE_PARENS_RE.split(frag) if p.strip()]
        if parts:
            actor = parts[0]
            if len(parts) > 1:
                place = ", ".join(parts[1:])

    # Fallback simple 'sold to X' (e.g. when the phrase is not on the last line)
    if not actor:
        sold_to = _SOLD_TO_RE.search(txt)
        if sold_to:
            actor = _clean(sold_to.group(1))

    return {
        "event_type": event_type,
        "date_from": f"{year}-01-01" if year else None,
        "date_to": None,
        "place": _clean(place),
        "actor": _clean(actor),
        "method": None,
        "source_ref": "inferred:sentence",
    }


def infer_events_from_sentences(sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Infer provenance events from sentence rows (dicts with 'seq' and 'sentence').

    Only events that yielded an actor or a place are kept. Each kept event
    carries the 'seq' of its sentence. Duplicates, compared on
    (actor, place, event_type, date_from), are dropped, keeping the first.
    """
    seen = set()
    uniq: List[Dict[str, Any]] = []
    for s in sentences or ():
        ev = _infer_from_sentence(s.get("sentence", ""))
        if not ev or not (ev.get("actor") or ev.get("place")):
            continue
        key = (ev.get("actor"), ev.get("place"), ev.get("event_type"), ev.get("date_from"))
        if key in seen:
            continue
        seen.add(key)
        ev["seq"] = s.get("seq")
        uniq.append(ev)
    return uniq
350
+
351
+ # ── OPTIONAL: simple geocode cache for map pins ───────────────────────────────
352
def geocode_place_cached(place: str):
    """
    Resolve a place name to coordinates, using the DB table
    places_cache(place VARCHAR(255) PRIMARY KEY, lat DOUBLE, lon DOUBLE,
    updated_at TIMESTAMP) as a cache in front of Nominatim.

    Returns a mapping with 'lat' and 'lon', or None when the place is empty
    or could not be geocoded. Lookup failures are best effort and never raise.
    """
    if not place:
        return None

    with cursor() as cur:
        cur.execute("CREATE TABLE IF NOT EXISTS places_cache (place VARCHAR(255) PRIMARY KEY, lat DOUBLE, lon DOUBLE, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)")
        cur.execute("SELECT lat, lon FROM places_cache WHERE place=%s", (place,))
        row = cur.fetchone()
    if row and row.get("lat") is not None and row.get("lon") is not None:
        return row

    # Try Nominatim (best effort). If outbound HTTP is blocked, just skip.
    lat, lon = None, None
    try:
        r = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={"q": place, "format": "json", "limit": 1},
            headers={"User-Agent": "provenance-radar/1.0"},
            timeout=6,
        )
        # Treat rate limiting / server errors as a failed lookup explicitly
        # instead of relying on their body failing to parse.
        r.raise_for_status()
        hits = r.json()
        if hits:
            lat, lon = float(hits[0]["lat"]), float(hits[0]["lon"])
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        lat, lon = None, None

    with cursor() as cur:
        # COALESCE keeps coordinates already stored for this place: a failed
        # lookup (NULLs) must not overwrite a value another request just cached.
        cur.execute(
            "INSERT INTO places_cache (place, lat, lon) VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE lat=COALESCE(VALUES(lat), lat), lon=COALESCE(VALUES(lon), lon), updated_at=CURRENT_TIMESTAMP",
            (place, lat, lon),
        )
    if lat is None or lon is None:
        return None
    return {"lat": lat, "lon": lon}
387
+
388
  def _policy_hits_for_date(d: str):
389
  """Return policy codes a given ISO date falls into."""
390
  if not d:
 
567
  obj = cur.fetchone()
568
  if not obj:
569
  return jsonify({"ok": False, "error": "not_found"}), 404
570
+
571
  cur.execute("""SELECT event_type, date_from, date_to, place, actor, source_ref
572
  FROM provenance_events WHERE object_id=%s
573
  ORDER BY COALESCE(date_from,'0001-01-01')""", (object_id,))
574
  events = cur.fetchall()
 
575
 
576
+ cur.execute("SELECT seq, sentence FROM provenance_sentences WHERE object_id=%s ORDER BY seq", (object_id,))
577
+ sents = cur.fetchall()
578
+
579
+ inferred = infer_events_from_sentences(sents)
580
+
581
+ # Prefer stored events; fill with inferred where stored is thin
582
+ merged = list(events)
583
+ if not merged or all((not e.get("actor") and not e.get("place")) for e in merged):
584
+ merged = inferred
585
+ else:
586
+ # add inferred items that add missing actor/place for the same year
587
+ have = {(e.get("actor"), e.get("place"), e.get("event_type"), to_iso(e.get("date_from"))): True for e in merged}
588
+ for e in inferred:
589
+ key = (e.get("actor"), e.get("place"), e.get("event_type"), to_iso(e.get("date_from")))
590
+ if key not in have:
591
+ merged.append(e)
592
+
593
+ g = build_graph_from_events(obj, merged)
594
+
595
+ # NEW: link successive actors to show chain of custody
596
+ actors_in_time = [ (to_iso(e.get("date_from")) or "0001-01-01", e.get("actor")) for e in merged if e.get("actor") ]
597
+ actors_in_time.sort(key=lambda x: x[0])
598
+ for i in range(len(actors_in_time) - 1):
599
+ a1 = actors_in_time[i][1]; a2 = actors_in_time[i+1][1]
600
+ if a1 and a2 and a1 != a2:
601
+ g["edges"].append({
602
+ "source": f"actor:{a1}",
603
+ "target": f"actor:{a2}",
604
+ "label": "TRANSFER",
605
+ "date": actors_in_time[i+1][0],
606
+ "weight": 0.8,
607
+ "policy": _policy_hits_for_date(actors_in_time[i+1][0]),
608
+ "source_ref": "link:sequence"
609
+ })
610
+
611
+ return jsonify({"ok": True, **g})
612
+
613
@app.get("/api/places/<int:object_id>")
@with_db_retry
def places(object_id: int):
    """
    Return the distinct places for an object, each with its earliest known
    date and (when geocoding succeeds) coordinates, ordered chronologically.

    Places come from stored provenance_events rows plus events inferred from
    provenance_sentences. Undated places sort last.
    """
    with cursor() as cur:
        cur.execute("""SELECT place, date_from FROM provenance_events WHERE object_id=%s""", (object_id,))
        ev = cur.fetchall()
        cur.execute("SELECT seq, sentence FROM provenance_sentences WHERE object_id=%s ORDER BY seq", (object_id,))
        sents = cur.fetchall()

    inferred = infer_events_from_sentences(sents)

    # fetchall() may hand back a tuple (PyMySQL does for an empty result set),
    # and tuple + list raises TypeError, so normalise to a list first.
    earliest = {}  # place -> earliest ISO date seen (None when undated)
    for e in list(ev) + inferred:
        p = _clean(e.get("place"))
        if not p:
            continue
        d = to_iso(e.get("date_from"))
        # unique by place, keep earliest date; undated entries compare as far future
        if p not in earliest or (d or "9999-12-31") < (earliest[p] or "9999-12-31"):
            earliest[p] = d

    out = []
    for p, d in earliest.items():
        geo = geocode_place_cached(p) or {}  # empty if geocoding is blocked or fails
        out.append({"place": p, "date": d, "lat": geo.get("lat"), "lon": geo.get("lon")})

    # order chronologically for path drawing
    out.sort(key=lambda x: x.get("date") or "9999-12-31")
    return jsonify({"ok": True, "places": out})
644
+
645
+ @app.get("/api/timeline/<int:object_id>"
646
  @with_db_retry
647
  def timeline(object_id: int):
648
  with cursor() as cur: