Update main.py

main.py CHANGED

@@ -12,6 +12,9 @@ from pymysql.err import OperationalError
 import threading
 warnings.filterwarnings("ignore")
 
+# ── NEW: lightweight event inference from sentences ───────────────────────────
+import re
+from typing import List, Dict, Any, Optional
 # ──────────────────────────────────────────────────────────────────────────────
 # CONFIG
 # ──────────────────────────────────────────────────────────────────────────────

@@ -248,6 +251,140 @@ def gemini_explain(prompt: str, sys: str = None, model: str = EXPLAIN_MODEL) ->
 # ──────────────────────────────────────────────────────────────────────────────
 # UTIL: Build graph & timeline from events (+ risk overlays)
 # ──────────────────────────────────────────────────────────────────────────────
+
+
+
+EVENT_VERBS = {
+    "sold": "SOLD",
+    "purchased": "PURCHASED",
+    "bought": "PURCHASED",
+    "acquired": "ACQUIRED",
+    "donated": "DONATED",
+    "gifted": "DONATED",
+    "bequeathed": "BEQUEATHED",
+    "consigned": "CONSIGNED",
+    "exhibited": "EXHIBITED",
+    "exported": "EXPORTED",
+    "imported": "IMPORTED",
+}
+
+YEAR_RE = re.compile(r"\b(1[6-9]\d{2}|20\d{2})\b")  # 1600–2099
+
+def _clean(s: Optional[str]) -> Optional[str]:
+    if not s: return None
+    s = re.sub(r"\s+", " ", s).strip(" ,.;:-–—")
+    return s or None
+
+def _infer_from_sentence(txt: str) -> Optional[Dict[str, Any]]:
+    """
+    Very pragmatic patterns that cover most catalogue phrasing:
+      - 'sold to X, <place>, 2000'
+      - 'sold to X, by 2000'
+      - 'purchased from Y in 1965'
+      - 'donated by X, <place>, 1971'
+    Returns a dict compatible with provenance_events rows.
+    """
+    if not txt:
+        return None
+    low = txt.lower()
+
+    # find verb
+    verb = next((EVENT_VERBS[v] for v in EVENT_VERBS if v in low), None)
+    if not verb:
+        return None
+
+    # pull a year (prefers the last year in the string)
+    years = YEAR_RE.findall(txt)
+    year = years[-1] if years else None
+
+    actor = None
+    place = None
+
+    # Common pattern: 'sold to X, place, 2000'
+    m = re.search(r"\b(sold|purchased|bought|acquired|donated|gifted|bequeathed|consigned)\s+(to|by|from)\s+(.*)$", low)
+    if m:
+        # Take the fragment after 'to/by/from'
+        frag = txt[m.end(2)+1:].strip()
+        # Trim a trailing year, 'by 2000', or 'in 1965'
+        frag = re.sub(r"(,\s*)?(\b(?:by|in)\s+)?\b(1[6-9]\d{2}|20\d{2})\b.*$", "", frag, flags=re.IGNORECASE).strip(" ,.;")
+        # Split on commas: first token is actor; the rest (if any) is place
+        parts = [p.strip() for p in re.split(r",(?![^()]*\))", frag) if p.strip()]
+        if parts:
+            actor = parts[0]
+        if len(parts) > 1:
+            place = ", ".join(parts[1:])
+
+    # Fallback simple 'sold to X' without commas
+    if not actor:
+        m2 = re.search(r"\bsold\s+to\s+([^,.;]+)", low)
+        if m2:
+            actor = _clean(txt[m2.start(1):m2.end(1)])
+
+    return {
+        "event_type": verb,
+        "date_from": f"{year}-01-01" if year else None,
+        "date_to": None,
+        "place": _clean(place),
+        "actor": _clean(actor),
+        "method": None,
+        "source_ref": "inferred:sentence"
+    }
+
+def infer_events_from_sentences(sentences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    out: List[Dict[str, Any]] = []
+    for s in sentences:
+        ev = _infer_from_sentence(s.get("sentence", ""))
+        if ev and (ev.get("actor") or ev.get("place")):
+            ev["seq"] = s.get("seq")
+            out.append(ev)
+    # Deduplicate (actor+place+event_type+date_from)
+    seen = set()
+    uniq = []
+    for e in out:
+        key = (e.get("actor"), e.get("place"), e.get("event_type"), e.get("date_from"))
+        if key in seen:
+            continue
+        seen.add(key)
+        uniq.append(e)
+    return uniq
+
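A quick sanity check of the inference helpers (hypothetical, not part of the commit; assumes _infer_from_sentence and infer_events_from_sentences are importable from main.py):

    # Hypothetical usage sketch for the helpers above.
    from main import _infer_from_sentence, infer_events_from_sentences

    ev = _infer_from_sentence("Sold to John Smith, London, by 1972.")
    # -> {'event_type': 'SOLD', 'date_from': '1972-01-01', 'date_to': None,
    #     'place': 'London', 'actor': 'John Smith', 'method': None,
    #     'source_ref': 'inferred:sentence'}

    # Repeated sentences collapse on (actor, place, event_type, date_from):
    rows = [{"seq": 1, "sentence": "Sold to John Smith, London, by 1972."},
            {"seq": 2, "sentence": "Sold to John Smith, London, by 1972."}]
    assert len(infer_events_from_sentences(rows)) == 1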
+
# ββ OPTIONAL: simple geocode cache for map pins βββββββββββββββββββββββββββββββ
|
| 352 |
+
def geocode_place_cached(place: str):
|
| 353 |
+
"""Cache in DB: places_cache(place TEXT PRIMARY KEY, lat DOUBLE, lon DOUBLE, updated_at TIMESTAMP)"""
|
| 354 |
+
if not place:
|
| 355 |
+
return None
|
| 356 |
+
with cursor() as cur:
|
| 357 |
+
cur.execute("CREATE TABLE IF NOT EXISTS places_cache (place VARCHAR(255) PRIMARY KEY, lat DOUBLE, lon DOUBLE, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)")
|
| 358 |
+
cur.execute("SELECT lat, lon FROM places_cache WHERE place=%s", (place,))
|
| 359 |
+
row = cur.fetchone()
|
| 360 |
+
if row and row.get("lat") is not None and row.get("lon") is not None:
|
| 361 |
+
return row
|
| 362 |
+
|
| 363 |
+
# Try Nominatim (best effort). If outbound HTTP is blocked, just skip.
|
| 364 |
+
try:
|
| 365 |
+
r = requests.get(
|
| 366 |
+
"https://nominatim.openstreetmap.org/search",
|
| 367 |
+
params={"q": place, "format": "json", "limit": 1},
|
| 368 |
+
headers={"User-Agent": "provenance-radar/1.0"},
|
| 369 |
+
timeout=6,
|
| 370 |
+
)
|
| 371 |
+
j = r.json()
|
| 372 |
+
if j:
|
| 373 |
+
lat, lon = float(j[0]["lat"]), float(j[0]["lon"])
|
| 374 |
+
else:
|
| 375 |
+
lat, lon = None, None
|
| 376 |
+
except Exception:
|
| 377 |
+
lat, lon = None, None
|
| 378 |
+
|
| 379 |
+
with cursor() as cur:
|
| 380 |
+
cur.execute(
|
| 381 |
+
"INSERT INTO places_cache (place, lat, lon) VALUES (%s,%s,%s) ON DUPLICATE KEY UPDATE lat=VALUES(lat), lon=VALUES(lon), updated_at=CURRENT_TIMESTAMP",
|
| 382 |
+
(place, lat, lon),
|
| 383 |
+
)
|
| 384 |
+
if lat is None or lon is None:
|
| 385 |
+
return None
|
| 386 |
+
return {"lat": lat, "lon": lon}
|
| 387 |
+
|
| 388 |
def _policy_hits_for_date(d: str):
|
| 389 |
"""Return policy codes a given ISO date falls into."""
|
| 390 |
if not d:
|
|
|
|
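Note that geocode_place_cached writes failed lookups into places_cache as NULL rows but returns None for them; since the SELECT only short-circuits on non-NULL coordinates, a blocked or empty lookup is retried on the next call rather than negatively cached. A minimal usage sketch (hypothetical; assumes the function is importable from main.py and outbound HTTP is allowed, with the User-Agent header Nominatim expects already set by the code):

    geo = geocode_place_cached("Paris")   # first call may hit Nominatim
    # e.g. {'lat': 48.85, 'lon': 2.35}, or None if the lookup failed
    geo = geocode_place_cached("Paris")   # on success, served from places_cache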
@@ -430,13 +567,82 @@ def graph(object_id: int):
         obj = cur.fetchone()
         if not obj:
             return jsonify({"ok": False, "error": "not_found"}), 404
+
         cur.execute("""SELECT event_type, date_from, date_to, place, actor, source_ref
                        FROM provenance_events WHERE object_id=%s
                        ORDER BY COALESCE(date_from,'0001-01-01')""", (object_id,))
         events = cur.fetchall()
-        return jsonify({"ok": True, **build_graph_from_events(obj, events)})
 
-
+        cur.execute("SELECT seq, sentence FROM provenance_sentences WHERE object_id=%s ORDER BY seq", (object_id,))
+        sents = cur.fetchall()
+
+        inferred = infer_events_from_sentences(sents)
+
+        # Prefer stored events; fill with inferred where stored is thin
+        merged = list(events)
+        if not merged or all((not e.get("actor") and not e.get("place")) for e in merged):
+            merged = inferred
+        else:
+            # add inferred items that add missing actor/place for the same year
+            have = {(e.get("actor"), e.get("place"), e.get("event_type"), to_iso(e.get("date_from"))): True for e in merged}
+            for e in inferred:
+                key = (e.get("actor"), e.get("place"), e.get("event_type"), to_iso(e.get("date_from")))
+                if key not in have:
+                    merged.append(e)
+
+        g = build_graph_from_events(obj, merged)
+
+        # NEW: link successive actors to show chain of custody
+        actors_in_time = [(to_iso(e.get("date_from")) or "0001-01-01", e.get("actor")) for e in merged if e.get("actor")]
+        actors_in_time.sort(key=lambda x: x[0])
+        for i in range(len(actors_in_time) - 1):
+            a1, a2 = actors_in_time[i][1], actors_in_time[i + 1][1]
+            if a1 and a2 and a1 != a2:
+                g["edges"].append({
+                    "source": f"actor:{a1}",
+                    "target": f"actor:{a2}",
+                    "label": "TRANSFER",
+                    "date": actors_in_time[i + 1][0],
+                    "weight": 0.8,
+                    "policy": _policy_hits_for_date(actors_in_time[i + 1][0]),
+                    "source_ref": "link:sequence"
+                })
+
+        return jsonify({"ok": True, **g})
+
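The chain-of-custody linking above sorts actor sightings by date and pairs neighbours. In isolation the pairing reduces to this sketch (hypothetical helper, not in the commit; dates here are plain ISO strings, whereas graph() runs them through to_iso first):

    from typing import Any, Dict, List

    def transfer_edges(merged: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Sort actor sightings chronologically, then link neighbouring owners.
        sightings = sorted(
            (e.get("date_from") or "0001-01-01", e["actor"])
            for e in merged if e.get("actor")
        )
        edges = []
        for (d1, a1), (d2, a2) in zip(sightings, sightings[1:]):
            if a1 != a2:  # consecutive distinct owners imply a transfer
                edges.append({"source": f"actor:{a1}", "target": f"actor:{a2}", "date": d2})
        return edges

    rows = [
        {"actor": "Galerie X", "date_from": "1965-01-01"},
        {"actor": "John Smith", "date_from": "1972-01-01"},
        {"actor": "John Smith", "date_from": "1980-01-01"},  # same owner again: no edge
    ]
    print(transfer_edges(rows))
    # [{'source': 'actor:Galerie X', 'target': 'actor:John Smith', 'date': '1972-01-01'}]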
+
@app.get("/api/places/<int:object_id>")
|
| 614 |
+
@with_db_retry
|
| 615 |
+
def places(object_id: int):
|
| 616 |
+
with cursor() as cur:
|
| 617 |
+
cur.execute("""SELECT place, date_from FROM provenance_events WHERE object_id=%s""", (object_id,))
|
| 618 |
+
ev = cur.fetchall()
|
| 619 |
+
cur.execute("SELECT seq, sentence FROM provenance_sentences WHERE object_id=%s ORDER BY seq", (object_id,))
|
| 620 |
+
sents = cur.fetchall()
|
| 621 |
+
|
| 622 |
+
inferred = infer_events_from_sentences(sents)
|
| 623 |
+
all_places = []
|
| 624 |
+
for e in ev + inferred:
|
| 625 |
+
p = _clean(e.get("place"))
|
| 626 |
+
if p:
|
| 627 |
+
all_places.append({"place": p, "date": to_iso(e.get("date_from"))})
|
| 628 |
+
|
| 629 |
+
# unique by place, keep earliest date
|
| 630 |
+
agg = {}
|
| 631 |
+
for r in all_places:
|
| 632 |
+
d = r["date"] or "9999-12-31"
|
| 633 |
+
if r["place"] not in agg or d < (agg[r["place"]].get("date") or "9999-12-31"):
|
| 634 |
+
agg[r["place"]] = r
|
| 635 |
+
|
| 636 |
+
out = []
|
| 637 |
+
for p, info in agg.items():
|
| 638 |
+
geo = geocode_place_cached(p) # may be None if geocoding blocked
|
| 639 |
+
out.append({"place": p, "date": info.get("date"), "lat": (geo or {}).get("lat"), "lon": (geo or {}).get("lon")})
|
| 640 |
+
|
| 641 |
+
# order chronologically for path drawing
|
| 642 |
+
out.sort(key=lambda x: x.get("date") or "9999-12-31")
|
| 643 |
+
return jsonify({"ok": True, "places": out})
|
| 644 |
+
|
| 645 |
+
@app.get("/api/timeline/<int:object_id>"
|
| 646 |
@with_db_retry
|
| 647 |
def timeline(object_id: int):
|
| 648 |
with cursor() as cur:
|