Upload 70 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- NorthMaconPark.pdf +3 -0
- __init__.py +0 -0
- app.py +628 -0
- config.py +40 -0
- data/BUILDING_CODE.json +0 -0
- data/FUEL_GAS_CODE.json +0 -0
- data/GENERAL_ADMINISTRATIVE_PROVISIONS.json +3 -0
- data/MECHANICAL_CODE.json +0 -0
- data/PLUMBING_CODE.json +0 -0
- data/ingest_chromadb.py +120 -0
- data/nyc_code_db/chroma.sqlite3 +3 -0
- data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/data_level0.bin +3 -0
- data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/header.bin +3 -0
- data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/index_metadata.pickle +3 -0
- data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/length.bin +3 -0
- data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/link_lists.bin +3 -0
- data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/data_level0.bin +3 -0
- data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/header.bin +3 -0
- data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/index_metadata.pickle +3 -0
- data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/length.bin +3 -0
- data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/link_lists.bin +3 -0
- data/preprocess_codes.py +253 -0
- graph.py +147 -0
- nodes/__init__.py +0 -0
- nodes/__pycache__/__init__.cpython-313.pyc +0 -0
- nodes/__pycache__/annotator.cpython-313.pyc +0 -0
- nodes/__pycache__/code_lookup.cpython-313.pyc +0 -0
- nodes/__pycache__/compliance_analyst.cpython-313.pyc +0 -0
- nodes/__pycache__/compliance_planner.cpython-313.pyc +0 -0
- nodes/__pycache__/cropper.cpython-313.pyc +0 -0
- nodes/__pycache__/deliberation.cpython-313.pyc +0 -0
- nodes/__pycache__/final_verdict.cpython-313.pyc +0 -0
- nodes/__pycache__/metadata_generator.cpython-313.pyc +0 -0
- nodes/annotator.py +117 -0
- nodes/code_lookup.py +286 -0
- nodes/compliance_analyst.py +188 -0
- nodes/compliance_planner.py +130 -0
- nodes/cropper.py +234 -0
- nodes/deliberation.py +85 -0
- nodes/final_verdict.py +107 -0
- nodes/metadata_generator.py +211 -0
- prompts/__init__.py +0 -0
- prompts/__pycache__/__init__.cpython-313.pyc +0 -0
- prompts/__pycache__/annotator.cpython-313.pyc +0 -0
- prompts/__pycache__/code_lookup.cpython-313.pyc +0 -0
- prompts/__pycache__/compliance_analyst.cpython-313.pyc +0 -0
- prompts/__pycache__/compliance_planner.cpython-313.pyc +0 -0
- prompts/__pycache__/cropper.cpython-313.pyc +0 -0
- prompts/__pycache__/deliberation.cpython-313.pyc +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/GENERAL_ADMINISTRATIVE_PROVISIONS.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/nyc_code_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
NorthMaconPark.pdf filter=lfs diff=lfs merge=lfs -text
|
NorthMaconPark.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aed76b73fbe205e1579e3a00be6e95b7564e72594b1fdb83311819d447f8fc4
|
| 3 |
+
size 39114794
|
__init__.py
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Streamlit UI for NYC Code Compliance Bot — with agent discussion panel."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
import streamlit as st
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
from config import MAX_INVESTIGATION_ROUNDS
|
| 14 |
+
from graph import compile_compliance_graph
|
| 15 |
+
from tools.chroma_tools import warmup_collection, is_warmed_up
|
| 16 |
+
from tools.crop_cache import CropCache
|
| 17 |
+
from tools.image_store import ImageStore
|
| 18 |
+
from tools.metadata_cache import MetadataState, get_cached_metadata
|
| 19 |
+
from tools.pdf_processor import render_pages
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------
# Must be the first Streamlit call in the script.
st.set_page_config(
    page_title="NYC Code Compliance Bot",
    page_icon=":building_construction:",
    layout="wide",
)

# ---------------------------------------------------------------------------
# Custom CSS for agent discussion panel
# ---------------------------------------------------------------------------
# The ".agent-<name>" class names must match the f"agent-{agent}" strings
# built in render_discussion_log below; each agent gets its own card color.
st.markdown("""
<style>
.agent-msg {
    padding: 8px 12px;
    margin: 4px 0;
    border-radius: 8px;
    font-size: 0.9em;
}
.agent-planner {
    background-color: #e3f2fd;
    border-left: 4px solid #1565c0;
}
.agent-code_analyst {
    background-color: #fff3e0;
    border-left: 4px solid #e65100;
}
.agent-compliance_analyst {
    background-color: #e8f5e9;
    border-left: 4px solid #2e7d32;
}
.agent-reviewer {
    background-color: #f3e5f5;
    border-left: 4px solid #6a1b9a;
}
.agent-icon {
    font-weight: bold;
    margin-right: 6px;
}
.agent-timestamp {
    color: #666;
    font-size: 0.8em;
}
</style>
""", unsafe_allow_html=True)
|
| 69 |
+
|
| 70 |
+
# Per-agent display attributes: emoji glyph + human-readable label, keyed by
# the agent id that appears in discussion-log entries.  Kept in one table so
# an agent cannot have an icon without a label (or vice versa).
_AGENT_DISPLAY: dict[str, tuple[str, str]] = {
    "planner": ("\U0001f4cb", "Planner"),
    "code_analyst": ("\u2696\ufe0f", "Code Analyst"),
    "compliance_analyst": ("\U0001f50d", "Compliance Analyst"),
    "reviewer": ("\U0001f91d", "Reviewer"),
}

AGENT_ICONS = {agent: icon for agent, (icon, _label) in _AGENT_DISPLAY.items()}
AGENT_LABELS = {agent: label for agent, (_icon, label) in _AGENT_DISPLAY.items()}
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
# Session state defaults
# ---------------------------------------------------------------------------
# Key -> zero-argument factory.  Factories (rather than shared values) keep
# mutable defaults (lists, dict, caches) fresh per session, and defer the
# MetadataState/CropCache construction until the key is actually missing —
# exactly matching the original one-if-per-key initialization.
_SESSION_DEFAULTS = {
    "pdf_loaded": lambda: False,
    "chat_history": list,
    "image_store": lambda: None,
    "ingest_state": dict,
    "pdf_bytes": lambda: None,
    "metadata_state": MetadataState,
    "crop_cache": CropCache,
    "discussion_log": list,
    "code_report": str,
    "code_sections": list,
    "image_refs": list,
    "db_ready": lambda: False,
}

for _state_key, _factory in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _factory()
|
| 111 |
+
|
| 112 |
+
# ---------------------------------------------------------------------------
# Startup: warm up embedding model + ChromaDB
# ---------------------------------------------------------------------------
# Runs on every script pass until db_ready is True; warmup_collection()
# returns a bool indicating whether the code database is usable.
if not st.session_state.db_ready:
    with st.status("Loading NYC Code Database...", expanded=True) as _db_status:
        st.write(":brain: Loading embedding model (bge-large-en-v1.5)...")
        st.write("_This is a one-time download (~1.3 GB) on first run._")
        ok = warmup_collection()
        if ok:
            st.session_state.db_ready = True
            _db_status.update(label="NYC Code Database ready", state="complete")
        else:
            # App remains usable without the DB; downstream code-lookup nodes
            # are expected to degrade gracefully.
            _db_status.update(
                label="NYC Code Database not available — code lookup will be disabled",
                state="error",
            )
            st.session_state.db_ready = False
    # Rerun only on success so the status widget is replaced by the normal
    # layout; on failure the error status stays visible.
    if st.session_state.db_ready:
        st.rerun()
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------------
# Sidebar
# ---------------------------------------------------------------------------
# Widget values (uploaded_file, enable_consensus, enable_annotation,
# max_rounds) are read by the ingestion and question-processing sections
# further down in this same script run.
with st.sidebar:
    st.title(":building_construction: NYC Code Compliance Bot")
    st.markdown(
        "Upload a construction drawing PDF and ask compliance questions. "
        "The system uses **agentic vision** + **NYC code database** to "
        "verify code compliance."
    )

    st.divider()

    # PDF upload
    uploaded_file = st.file_uploader("Upload Drawing PDF", type=["pdf"])

    # Default drawing button — offered only until some PDF has been ingested.
    _DEFAULT_PDF = Path(__file__).parent / "NorthMaconPark.pdf"
    if _DEFAULT_PDF.exists() and not st.session_state.pdf_loaded:
        st.markdown("**— or —**")
        if st.button("Use Default Drawing", use_container_width=True):
            # Flag consumed by the Phase-A ingestion section on the next run.
            st.session_state._use_default_pdf = True
            st.rerun()

    st.divider()

    # Settings
    st.subheader("Settings")
    enable_consensus = st.checkbox(
        "Enable peer review (Gemini + GPT)",
        value=False,
        help="GPT reviews Gemini's compliance analysis. Slower but more thorough.",
    )
    enable_annotation = st.checkbox(
        "Enable annotation",
        value=False,
        help="Annotate crops with numbered highlights before analysis.",
    )
    max_rounds = st.slider(
        "Max investigation rounds",
        min_value=1,
        max_value=5,
        value=MAX_INVESTIGATION_ROUNDS,
        help="Maximum crop-analyze loops before forcing a final verdict.",
    )

    st.divider()
    st.caption("Powered by LangGraph + Gemini + GPT + ChromaDB")
|
| 180 |
+
|
| 181 |
+
# ---------------------------------------------------------------------------
# PDF ingestion — Phase A: render pages
# ---------------------------------------------------------------------------
# Determine if we have a PDF to process (uploaded or default)
_pending_pdf: tuple[str, bytes] | None = None
if not st.session_state.pdf_loaded:
    if uploaded_file is not None:
        _pending_pdf = (uploaded_file.name, uploaded_file.getvalue())
    elif st.session_state.get("_use_default_pdf"):
        _default = Path(__file__).parent / "NorthMaconPark.pdf"
        if _default.exists():
            _pending_pdf = (_default.name, _default.read_bytes())

if _pending_pdf is not None and not st.session_state.pdf_loaded:
    pdf_name, pdf_bytes = _pending_pdf
    with st.status("Converting PDF to images...", expanded=True) as status:
        # Work in a fresh temp dir; pages and crops for this PDF live here
        # for the lifetime of the session.
        tmp_dir = tempfile.mkdtemp(prefix="compliance_bot_")
        pdf_path = Path(tmp_dir) / pdf_name
        pdf_path.write_bytes(pdf_bytes)

        st.session_state.pdf_bytes = pdf_bytes
        # New PDF invalidates any previous crop cache.
        st.session_state.crop_cache = CropCache()

        image_store = ImageStore(str(Path(tmp_dir) / "images"))
        st.session_state.image_store = image_store
        # NOTE(review): reaches into ImageStore's private `_pages_dir` —
        # presumably the directory page renders go into; a public accessor
        # on ImageStore would be cleaner. Confirm against tools/image_store.
        page_image_dir = str(image_store._pages_dir)

        # Check for cached metadata
        cached = get_cached_metadata(pdf_bytes)
        if cached is not None:
            # Pre-populate the page index so Phase B can be skipped.
            st.session_state.metadata_state.set_ready(json.dumps(cached, indent=2))
            st.write("Page index loaded from cache")

        st.write("Rendering pages...")
        num_pages = render_pages(str(pdf_path), page_image_dir)

        st.session_state.ingest_state = {
            "pdf_path": str(pdf_path),
            "page_image_dir": page_image_dir,
            "num_pages": num_pages,
        }
        st.session_state.pdf_loaded = True
        # Consume the one-shot "use default PDF" flag, if it was set.
        st.session_state.pop("_use_default_pdf", None)
        st.write(f"Converted {num_pages} pages to images.")
        status.update(label=f"PDF ready: {num_pages} pages", state="complete")
    st.rerun()
|
| 227 |
+
|
| 228 |
+
# ---------------------------------------------------------------------------
# PDF ingestion — Phase B: generate page index
# ---------------------------------------------------------------------------
# Runs once per PDF: after Phase A rendered the pages but before the page
# index exists.  generate_sync blocks this script pass; st.rerun() afterwards
# re-renders the app with meta.status settled (ready or failed).
if st.session_state.pdf_loaded:
    meta = st.session_state.metadata_state
    if meta.status == "not_started":
        # Keep the PDF viewable while the (potentially slow) indexing runs.
        if st.session_state.pdf_bytes is not None:
            with st.expander(":page_facing_up: PDF Viewer", expanded=False):
                st.pdf(st.session_state.pdf_bytes, height=400)

        ingest = st.session_state.ingest_state
        num_pages = ingest["num_pages"]

        st.write("**Generating page index...**")
        progress_bar = st.progress(0, text="Analyzing pages to build searchable index...")

        def _index_progress(completed: int, total: int, label: str) -> None:
            """Progress callback for generate_sync: advance the progress bar.

            Fix: guard total == 0 so a degenerate batch count cannot raise
            ZeroDivisionError mid-indexing, and clamp to [0, 1] because
            st.progress raises on values outside that range.
            """
            pct = min(completed / total, 1.0) if total > 0 else 1.0
            progress_bar.progress(pct, text=f"Indexing: {label} ({completed}/{total} batches)")

        meta.generate_sync(
            ingest["pdf_path"],
            num_pages,
            st.session_state.pdf_bytes,
            progress_callback=_index_progress,
        )
        if meta.is_ready:
            progress_bar.progress(1.0, text="Page index ready!")
        else:
            # Indexing failure is non-fatal: the planner falls back to
            # full-PDF mode (no page_metadata_json).
            progress_bar.progress(1.0, text="Indexing failed — using full PDF mode")
        st.rerun()
|
| 259 |
+
|
| 260 |
+
# ---------------------------------------------------------------------------
# Main layout (pre-upload welcome)
# ---------------------------------------------------------------------------
# Centered welcome screen; st.stop() prevents the three-column layout below
# from rendering until a PDF has been ingested.
if not st.session_state.pdf_loaded:
    _left, center, _right = st.columns([1, 2, 1])
    with center:
        st.markdown(
            "<h1 style='text-align: center;'>:building_construction: NYC Code Compliance Bot</h1>",
            unsafe_allow_html=True,
        )
        st.markdown(
            "<p style='text-align: center; color: grey;'>"
            "Upload a construction drawing PDF in the sidebar to get started.<br>"
            "This tool uses <b>agentic vision</b> and the <b>NYC Building Code database</b> "
            "to verify code compliance in your drawings."
            "</p>",
            unsafe_allow_html=True,
        )
    st.stop()
|
| 279 |
+
|
| 280 |
+
# ---------------------------------------------------------------------------
# PDF viewer
# ---------------------------------------------------------------------------
# Collapsed by default; only reachable once a PDF is loaded (st.stop above).
if st.session_state.pdf_bytes is not None:
    with st.expander(":page_facing_up: PDF Viewer", expanded=False):
        st.pdf(st.session_state.pdf_bytes, height=400)

# ---------------------------------------------------------------------------
# Three-column layout: chat | discussion | images+code
# ---------------------------------------------------------------------------
chat_col, discuss_col, evidence_col = st.columns([2, 2, 2])
|
| 291 |
+
|
| 292 |
+
# ---------------------------------------------------------------------------
# Discussion panel (agent conversation)
# ---------------------------------------------------------------------------
def render_discussion_log(container, discussion_log: list[dict]):
    """Render each agent message as a styled HTML card inside *container*.

    Unknown agents fall back to a robot glyph and their raw id; the
    "agent-{id}" class hooks into the CSS defined at the top of the file.
    """
    with container:
        for entry in discussion_log:
            who = entry.get("agent", "unknown")
            glyph = AGENT_ICONS.get(who, "\U0001f916")
            display_name = AGENT_LABELS.get(who, who)
            stamp = entry.get("timestamp", "")
            body = entry.get("summary", "")

            card_html = (
                f'<div class="agent-msg agent-{who}">'
                f'<span class="agent-timestamp">[{stamp}]</span> '
                f'<span class="agent-icon">{glyph} {display_name}</span><br>'
                f'{body}'
                f'</div>'
            )
            st.markdown(card_html, unsafe_allow_html=True)
|
| 312 |
+
|
| 313 |
+
# ---------------------------------------------------------------------------
# Chat history display
# ---------------------------------------------------------------------------
with chat_col:
    st.subheader(":speech_balloon: Chat")

    # Surface the page-index status so the user knows which planning mode
    # (fast indexed vs. full-PDF fallback) is active.
    meta = st.session_state.metadata_state
    if meta.is_ready:
        st.caption("Page index ready — fast planning enabled")
    elif meta.status == "failed":
        st.caption("Page indexing failed — using full PDF mode")

    # History entries are (role, content, image_refs) triples; refs are not
    # re-rendered here, only the text.
    for role, content, _refs in st.session_state.chat_history:
        with st.chat_message(role):
            st.markdown(content)

    question = st.chat_input("Ask a compliance question about the drawing...")
|
| 330 |
+
|
| 331 |
+
# ---------------------------------------------------------------------------
# Discussion panel
# ---------------------------------------------------------------------------
with discuss_col:
    st.subheader(":busts_in_silhouette: Agent Discussion")
    # Container handle kept so the question-processing loop below can
    # re-render the log incrementally while the graph streams.
    discussion_container = st.container()

    if st.session_state.discussion_log:
        render_discussion_log(discussion_container, st.session_state.discussion_log)
    else:
        st.info("Agent discussions will appear here during analysis.")
|
| 342 |
+
|
| 343 |
+
# ---------------------------------------------------------------------------
# Evidence panel (images + code)
# ---------------------------------------------------------------------------
with evidence_col:
    st.subheader(":framed_picture: Evidence")

    evidence_tabs = st.tabs(["Drawing Crops", "Code Sections"])

    # Tab 0: drawing crops collected during the last question.
    with evidence_tabs[0]:
        if st.session_state.image_refs:
            for ref in st.session_state.image_refs:
                try:
                    img = Image.open(ref["path"])
                    st.image(img, caption=ref["label"], use_container_width=True)
                except Exception:
                    # Crop file may have been cleaned up from the temp dir;
                    # degrade to a warning rather than crashing the panel.
                    st.warning(f"Could not load: {ref['label']}")
        elif st.session_state.chat_history:
            st.info("No images for this question.")
        else:
            st.info("Ask a question to see drawing crops here.")

    # Tab 1: NYC code sections retrieved by the lookup nodes.
    with evidence_tabs[1]:
        if st.session_state.code_sections:
            for sec in st.session_state.code_sections:
                with st.expander(
                    f":balance_scale: {sec.get('code_type', '?')} §{sec.get('section_full', '?')}",
                    expanded=False,
                ):
                    if sec.get("relevance"):
                        st.caption(sec["relevance"])
                    # Truncate long section bodies to keep the panel snappy.
                    st.markdown(sec.get("text", "")[:1500])
            if st.session_state.code_report:
                with st.expander(":page_facing_up: Full Code Report", expanded=False):
                    st.markdown(st.session_state.code_report[:5000])
        else:
            st.info("Code sections retrieved during analysis will appear here.")
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
# ---------------------------------------------------------------------------
# Question processing
# ---------------------------------------------------------------------------
# Streams the LangGraph compliance graph for one user question, updating the
# chat / discussion / evidence panels live from each node's state update.
if question:
    # Add user message to history
    st.session_state.chat_history.append(("user", question, []))
    st.session_state.discussion_log = []  # Reset discussion for new question
    st.session_state.code_report = ""  # Reset code report for new question
    st.session_state.code_sections = []  # Reset code sections for new question
    st.session_state.image_refs = []  # Reset image refs for new question

    with chat_col:
        with st.chat_message("user"):
            st.markdown(question)

    # Build initial state
    ingest = st.session_state.ingest_state
    image_store = st.session_state.image_store

    meta = st.session_state.metadata_state
    # Empty string signals "no page index" — nodes fall back to full-PDF mode.
    metadata_json = meta.data_json if meta.is_ready else ""

    # Initial LangGraph state; keys mirror the graph's state schema.
    question_state = {
        "messages": [],
        "question": question,
        "pdf_path": ingest.get("pdf_path", ""),
        "page_image_dir": ingest.get("page_image_dir", ""),
        "num_pages": ingest.get("num_pages", 0),
        "page_metadata_json": metadata_json,
        "legend_pages": [],
        "target_pages": [],
        "crop_tasks": [],
        "code_queries": [],
        "image_refs": [],
        "code_sections": [],
        "code_report": "",
        "code_chapters_fetched": [],
        "compliance_analysis": "",
        "reviewer_analysis": "",
        "final_verdict": "",
        "discussion_log": [],
        "additional_crop_tasks": [],
        "additional_code_queries": [],
        "needs_more_investigation": False,
        "investigation_round": 0,
        "max_rounds": max_rounds,
        "enable_consensus": enable_consensus,
        "enable_annotation": enable_annotation,
        "status_message": [],
    }

    # ------------------------------------------------------------------
    # Live progress
    # ------------------------------------------------------------------
    crop_cache = st.session_state.crop_cache

    # Placeholders inside the crops tab so the crop callback can update them
    # from within the streaming loop.
    with evidence_col:
        with evidence_tabs[0]:
            crop_counter_placeholder = st.empty()
            crop_image_container = st.container()

    def on_crop_progress(
        completed_ref, crop_task, source: str, completed_count: int, total_count: int,
    ) -> None:
        # Called by the cropper for each finished crop; `source` is "cached"
        # when the crop came from the CropCache rather than a fresh render.
        source_tag = " (cached)" if source == "cached" else ""
        crop_counter_placeholder.markdown(
            f"**Crop {completed_count}/{total_count}**{source_tag} \n"
            f"Latest: *{crop_task.get('label', 'Crop')}*"
        )
        with crop_image_container:
            try:
                img = Image.open(completed_ref["path"])
                caption = completed_ref["label"]
                if source == "cached":
                    caption += " (cached)"
                st.image(img, caption=caption, use_container_width=True)
            except Exception:
                st.warning(f"Could not load: {completed_ref['label']}")

    # Compile graph
    compliance_graph = compile_compliance_graph(image_store, crop_cache, on_crop_progress)

    # Node progress labels
    PROGRESS_LABELS = {
        "compliance_planner": "Planning investigation...",
        "execute_crops": "Cropping drawing images...",
        "annotate_crops": "Annotating crops...",
        "initial_code_lookup": "Searching NYC code database...",
        "compliance_analyst": "Analyzing compliance...",
        "targeted_code_lookup": "Follow-up code search...",
        "deliberation": "Running peer review...",
        "final_verdict": "Synthesizing verdict...",
    }

    with chat_col:
        with st.status("Investigating compliance...", expanded=True) as status:
            all_image_refs: list[dict] = []
            all_discussion: list[dict] = []
            final_verdict_text = ""
            code_report_text = ""

            st.write(PROGRESS_LABELS["compliance_planner"])

            # Placeholder for parallel-branch status (updated after planner completes)
            parallel_status = st.empty()

            # stream_mode="updates" yields one {node_name: state_delta} dict
            # per completed node.
            for event in compliance_graph.stream(question_state, stream_mode="updates"):
                node_name = list(event.keys())[0]
                update = event[node_name]

                # Status messages (list, since parallel nodes can both emit)
                status_msgs = update.get("status_message", [])
                for status_msg in status_msgs:
                    if status_msg:
                        st.write(f":white_check_mark: {status_msg}")

                # Collect discussion messages
                new_discussion = update.get("discussion_log", [])
                if new_discussion:
                    all_discussion.extend(new_discussion)
                    st.session_state.discussion_log = all_discussion
                    # Re-render discussion panel
                    render_discussion_log(discussion_container, all_discussion)

                # Node-specific handling
                if node_name == "compliance_planner":
                    target_pages = update.get("target_pages", [])
                    crop_tasks = update.get("crop_tasks", [])
                    code_queries = update.get("code_queries", [])

                    with st.expander(":clipboard: Investigation Plan", expanded=True):
                        if target_pages:
                            # Page numbers are 0-based internally; display 1-based.
                            st.markdown(f"**Target pages:** {', '.join(str(p + 1) for p in target_pages)}")
                        if crop_tasks:
                            st.markdown(f"**Image crops ({len(crop_tasks)}):**")
                            for i, task in enumerate(crop_tasks, 1):
                                display_page = task.get("page_num", 0) + 1
                                st.markdown(f" {i}. {task.get('label', 'Crop')} (p.{display_page})")
                        if code_queries:
                            st.markdown(f"**Code queries ({len(code_queries)}):**")
                            for i, q in enumerate(code_queries, 1):
                                st.markdown(f" {i}. [{q.get('focus_area', '?')}] {q.get('query', '')[:80]}...")

                    if crop_tasks:
                        crop_counter_placeholder.markdown(f"**Crop 0/{len(crop_tasks)}** — starting...")

                    # Show parallel execution message (this appears while both branches run)
                    parallel_status.info(
                        ":arrows_counterclockwise: Running in parallel: "
                        f"**Cropping {len(crop_tasks)} images** + "
                        f"**Searching {len(code_queries)} code queries**. "
                        "This may take 30-60 seconds..."
                    )

                elif node_name in ("initial_code_lookup", "execute_crops"):
                    # Clear the parallel status once a branch finishes
                    parallel_status.empty()

                # Separate if-chain: code-lookup results need handling whether
                # they came from the initial or the targeted lookup node.
                if node_name in ("initial_code_lookup", "targeted_code_lookup"):
                    report = update.get("code_report", "")
                    new_sections = update.get("code_sections", [])
                    if report:
                        code_report_text = report
                        st.session_state.code_report = report
                    if new_sections:
                        st.session_state.code_sections.extend(new_sections)
                        # Render each new section in the evidence panel in real-time
                        with evidence_col:
                            with evidence_tabs[1]:
                                for sec in new_sections:
                                    with st.expander(
                                        f":balance_scale: {sec.get('code_type', '?')} "
                                        f"§{sec.get('section_full', '?')}",
                                        expanded=False,
                                    ):
                                        if sec.get("relevance"):
                                            st.caption(sec["relevance"])
                                        st.markdown(sec.get("text", "")[:1500])

                elif node_name == "compliance_analyst":
                    analysis = update.get("compliance_analysis", "")
                    needs_more = update.get("needs_more_investigation", False)
                    round_num = update.get("investigation_round", 1)

                    if analysis:
                        label = f":mag: Compliance Analysis (Round {round_num})"
                        if needs_more:
                            label += " — requesting more evidence"
                        with st.expander(label, expanded=False):
                            st.markdown(analysis[:5000])

                elif node_name == "deliberation":
                    review = update.get("reviewer_analysis", "")
                    if review:
                        with st.expander(":handshake: Peer Review", expanded=False):
                            st.markdown(review[:3000])

                # Collect images — persist to session state and render in evidence panel
                new_refs = update.get("image_refs", [])
                if new_refs:
                    all_image_refs.extend(new_refs)
                    st.session_state.image_refs.extend(new_refs)
                    # Render each new crop in the evidence panel in real-time
                    with evidence_col:
                        with evidence_tabs[0]:
                            for ref in new_refs:
                                try:
                                    img = Image.open(ref["path"])
                                    st.image(img, caption=ref["label"], use_container_width=True)
                                except Exception:
                                    st.warning(f"Could not load: {ref['label']}")

                # Capture final verdict
                if "final_verdict" in update and update["final_verdict"]:
                    final_verdict_text = update["final_verdict"]

                # Show next step label
                if node_name in PROGRESS_LABELS:
                    # NOTE(review): rebuilt every iteration; could be hoisted
                    # above the loop as a constant — harmless at this scale.
                    next_labels = {
                        "compliance_planner": ["execute_crops", "initial_code_lookup"],
                        "execute_crops": ["compliance_analyst"],
                        "annotate_crops": ["compliance_analyst"],
                        "initial_code_lookup": ["compliance_analyst"],
                        "compliance_analyst": ["final_verdict"],
                        "targeted_code_lookup": ["compliance_analyst"],
                        "deliberation": ["final_verdict"],
                    }
                    for next_node in next_labels.get(node_name, []):
                        if next_node in PROGRESS_LABELS:
                            st.write(PROGRESS_LABELS[next_node])

            if crop_cache.size > 0:
                st.caption(f":file_folder: {crop_cache.stats}")
            status.update(label="Compliance investigation complete", state="complete")

    # Display final verdict
    if final_verdict_text:
        with chat_col:
            with st.chat_message("assistant"):
                st.markdown(final_verdict_text)

        # NOTE(review): this reassignment writes back the exact tuple that was
        # appended at the top of this branch — effectively a no-op; presumably
        # defensive. Confirm whether it can be removed.
        st.session_state.chat_history[-1] = ("user", question, [])
        st.session_state.chat_history.append(("assistant", final_verdict_text, all_image_refs))
    else:
        with chat_col:
            st.error("No verdict was generated. Please try again.")

    st.rerun()
|
config.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Central configuration: API keys, model names, vector-store paths, and processing budgets."""
import os
from dotenv import load_dotenv

# Explicit path so .env is found regardless of the working directory
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
load_dotenv(os.path.join(_THIS_DIR, ".env"))

# --- API Keys ---
# Empty-string defaults keep this module importable when a key is absent;
# callers are expected to validate before making API calls.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# --- Model Names ---
# One model per pipeline role; the role names mirror the graph nodes in graph.py.
PLANNER_MODEL = "gemini-3-flash-preview"
CROPPER_MODEL = "gemini-3-flash-preview"
ANNOTATOR_MODEL = "gemini-2.5-flash-image"
ANALYZER_MODEL = "gemini-3-flash-preview"
CODE_LOOKUP_MODEL = "gpt-5-mini"
DELIBERATION_MODEL = "gpt-5.2"
VERDICT_MODEL = "gemini-3-flash-preview"
METADATA_MODEL = "gemini-3-flash-preview"

# --- ChromaDB ---
# Must agree with data/ingest_chromadb.py, which builds this store.
CHROMA_DB_PATH = os.path.join(os.path.dirname(__file__), "data", "nyc_code_db")
CHROMA_COLLECTION_NAME = "nyc_building_codes"
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# --- Processing Constants ---
PDF_RENDER_DPI = 100          # presumably the rasterization DPI for PDF pages — confirm in cropper
MAX_INVESTIGATION_ROUNDS = 3  # cap on the analyst's evidence-gathering loop (used by graph.py)
CROP_PADDING_PX = 40          # presumably extra margin around crop regions — confirm in cropper

# --- Code Lookup Budgets (per individual query, queries run in parallel) ---
MAX_DISCOVER_CALLS = 1
MAX_FETCH_CALLS = 1
MAX_CODE_LOOKUP_TURNS = 5

# --- Search Defaults ---
DISCOVER_N_RESULTS = 50  # Retrieve more candidates for re-ranking
RERANK_TOP_K = 20  # Return top-K after re-ranking
FETCH_MAX_SECTIONS = 30  # Max sections per chapter fetch
|
data/BUILDING_CODE.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/FUEL_GAS_CODE.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/GENERAL_ADMINISTRATIVE_PROVISIONS.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2060b78a8e7c0061ee1dab1afe66a2a3e4cb28694af0495a8b3340a26c69940
|
| 3 |
+
size 20920937
|
data/MECHANICAL_CODE.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/PLUMBING_CODE.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/ingest_chromadb.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingest preprocessed NYC code JSON files into ChromaDB with bge-large-en-v1.5."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
import chromadb
|
| 9 |
+
from chromadb.utils import embedding_functions
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Embedding model and store identity: must match config.py (EMBEDDING_MODEL_NAME,
# CHROMA_COLLECTION_NAME, CHROMA_DB_PATH) so the app queries what this script builds.
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
COLLECTION_NAME = "nyc_building_codes"
DB_PATH = os.path.join(os.path.dirname(__file__), "nyc_code_db")

# Map of JSON files to their code types
# Keys are the outputs of data/preprocess_codes.py; values become the
# "code_type" metadata field and the id prefix for each ingested section.
CODE_FILES = {
    "BUILDING_CODE.json": "Building",
    "FUEL_GAS_CODE.json": "FuelGas",
    "GENERAL_ADMINISTRATIVE_PROVISIONS.json": "Administrative",
    "MECHANICAL_CODE.json": "Mechanical",
    "PLUMBING_CODE.json": "Plumbing",
}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def create_collection(db_path: str = DB_PATH, reset: bool = True):
    """Create or reset the ChromaDB collection.

    Opens a persistent client at *db_path*, optionally drops any existing
    collection of the same name, then creates a fresh one backed by the
    configured sentence-transformer embedding function.

    Returns:
        A ``(client, collection)`` pair.
    """
    client = chromadb.PersistentClient(path=db_path)

    if reset:
        try:
            client.delete_collection(name=COLLECTION_NAME)
            print(f"Deleted existing collection '{COLLECTION_NAME}'.")
        except Exception:
            # Nothing to delete on a first run — ignore.
            pass

    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL,
    )
    collection = client.create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedder,
    )
    return client, collection
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def ingest_json_file(collection, json_path: str, code_type: str) -> int:
    """Ingest a single JSON file into the collection.

    Args:
        collection: ChromaDB collection (anything exposing ``upsert``).
        json_path: Path to a preprocessed code JSON file — a list of entries,
            each with ``id``, ``text`` and ``metadata`` keys.
        code_type: Code-type label stamped onto every entry's metadata and
            prefixed onto its id.

    Returns:
        Count of unique sections added.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    documents = []
    metadatas = []
    ids = []
    seen_ids: set[str] = set()

    for entry in data:
        meta = entry["metadata"]
        # Ensure code_type is set (should already be from preprocessing)
        meta["code_type"] = code_type

        # Prefix ids with the code type so equal section numbers from
        # different codes (e.g. Building 101.1 vs Plumbing 101.1) never collide.
        unique_id = f"{code_type}_{entry['id']}"
        if unique_id in seen_ids:
            continue

        # Flatten list-type metadata for ChromaDB (only supports str/int/float/bool)
        flat_meta = {}
        for k, v in meta.items():
            if isinstance(v, list):
                flat_meta[k] = ", ".join(str(x) for x in v) if v else ""
            elif isinstance(v, (bool, int, float)):
                # Scalars ChromaDB accepts natively; one check replaces the
                # previous redundant bool-then-(int, float) branches.
                flat_meta[k] = v
            else:
                flat_meta[k] = str(v)

        documents.append(entry["text"])
        metadatas.append(flat_meta)
        ids.append(unique_id)
        seen_ids.add(unique_id)

    # Batch upsert
    batch_size = 200  # Smaller batches for larger embeddings
    for i in range(0, len(documents), batch_size):
        batch_end = min(i + batch_size, len(documents))
        collection.upsert(
            documents=documents[i:batch_end],
            metadatas=metadatas[i:batch_end],
            ids=ids[i:batch_end],
        )
        print(f"  Batch {i // batch_size + 1}: upserted {batch_end - i} sections")

    return len(ids)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def ingest_all(data_dir: str, db_path: str = DB_PATH) -> dict[str, int]:
    """Ingest all code JSON files into a fresh ChromaDB collection.

    Args:
        data_dir: Directory containing the preprocessed code JSON files.
        db_path: Location of the persistent ChromaDB store.

    Returns:
        Mapping of code type -> number of sections ingested.
    """
    print(f"Creating ChromaDB at {db_path} with embedding model: {EMBEDDING_MODEL}")
    _client, collection = create_collection(db_path, reset=True)

    counts: dict[str, int] = {}
    for filename, code_type in CODE_FILES.items():
        json_path = os.path.join(data_dir, filename)
        if os.path.exists(json_path):
            # Fix: name the actual source file in the log line — the previous
            # message printed a literal "(unknown)" placeholder instead of
            # the filename being processed.
            print(f"\nIngesting {filename} as '{code_type}'...")
            count = ingest_json_file(collection, json_path, code_type)
            counts[code_type] = count
            print(f"  -> {count} sections ingested")
        else:
            print(f"WARNING: {json_path} not found, skipping.")

    total = sum(counts.values())
    print(f"\nIngestion complete. Total: {total} sections across {len(counts)} code types.")
    return counts
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
    # CLI: python ingest_chromadb.py [data_dir]
    # Defaults to this script's own directory, where the JSON files live.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else os.path.dirname(__file__)
    ingest_all(data_dir)
|
data/nyc_code_db/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8cba84d670f2a081621d233004d1dc55d6e6766c0d9b3a5dbae28e4337aed86b
|
| 3 |
+
size 118640640
|
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b13fd3171ed75d30bbc76630532fc4ff49c459ac97735349fc8670bc5402056e
|
| 3 |
+
size 21180000
|
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4976cac8d856d3914be9284cb6405a0b16019a1c69f0d98c21d517da4492fed0
|
| 3 |
+
size 100
|
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abf1da4a172d264eca520e35093cd9d1dbfc1a2174c5cc08f2bf14e40dc27c0c
|
| 3 |
+
size 266462
|
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c59d245953aaf1e3e56e16a29fc026d3d3b255c07909bdf1345af07e98e51a70
|
| 3 |
+
size 20000
|
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2798c188a968c06bd2a7b8bf9886e25e9f0ef8de40118f57f92647e11c7e58cf
|
| 3 |
+
size 42916
|
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7351c3a89b1d41bdbc04c8cda4131784642164ac6289029f21ea79369ecebfd
|
| 3 |
+
size 43050468
|
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7de83396c6ea38709c2085abdd0ad029742aff3877b5cf4499cafca3bd2d3977
|
| 3 |
+
size 100
|
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa1c7ef73ee5c32a15415bf8d5e8d43ec77e26cfbf77cfe04e7ccbb6b295b705
|
| 3 |
+
size 557222
|
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a246c61293a0e64dc3582af64fbb6af58c2e9c26228c15c5023f6fd32d4dcff1
|
| 3 |
+
size 40652
|
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a17b8f2739f3b77723cdb5a3c0de9fcbd4e64d65b16a9be028ba214ecab449e2
|
| 3 |
+
size 86212
|
data/preprocess_codes.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Improved NYC code preprocessing — fixes duplicates, improves metadata, preserves structure."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import hashlib
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from collections import Counter, OrderedDict
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
# Text cleaning
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
def clean_and_flatten(text: str) -> str:
    """Normalize raw extracted text into a single flat string.

    Re-joins words split by hyphenation across lines, collapses whitespace
    and newlines, and preserves numbered-list items / "Exception" headers
    as explicit line breaks.
    """
    # Re-join words broken across lines by a trailing hyphen ("accord-\nance").
    s = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)
    # Tag numbered-list items and Exception headers so they survive flattening.
    s = re.sub(r"\n\s*(\d+\.)\s+", r" __LISTBREAK__ \1 ", s)
    s = re.sub(r"\n\s*(Exception(?:s)?[\s:.])", r" __LISTBREAK__ \1", s)
    # Flatten every remaining newline.
    s = s.replace("\n", " ")
    # Tighten spaced hyphens inside section numbers ("28 - 101" -> "28-101").
    s = re.sub(r"(\d+)\s*-\s*(\d+)", r"\1-\2", s)
    s = re.sub(r"\s+", " ", s).strip()
    # Turn the protected markers back into real line breaks.
    return s.replace("__LISTBREAK__", "\n")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Anchor / section detection
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
def get_dominant_anchor(content: str) -> str | None:
    """Detect the dominant chapter digit (1-9) or Appendix letter (A-Z).

    Scans line starts for section headings like "101.1" or "A101." and
    returns the most frequent leading digit/letter, or None when no
    heading-like line is found.
    """
    hits = re.findall(
        r"(?m)^(?:\*?\s?§?\s?)(?:([1-9])\d{2,3}\.|([A-Z])(?:\d{2,3})?\.)",
        content,
    )
    # Each match yields a (digit, letter) pair with exactly one group filled.
    tally = Counter(group for pair in hits for group in pair if group)
    if not tally:
        return None
    return tally.most_common(1)[0][0]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Metadata extraction from section text
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
|
| 51 |
+
# Occupancy groups like "Group A-2" or "Occupancy B, M" (captures the
# comma-separated class list after the keyword).
_OCCUPANCY_RE = re.compile(
    r"\b(?:Group|Occupancy|Classification)\s+"
    r"([A-Z]-?\d?(?:\s*,\s*[A-Z]-?\d?)*)",
    re.IGNORECASE,
)
# Construction types "Type I"–"Type V", optionally suffixed A or B.
_CONSTRUCTION_TYPE_RE = re.compile(
    r"\bType\s+(I[A-B]?|II[A-B]?|III[A-B]?|IV[A-B]?|V[A-B]?)\b",
    re.IGNORECASE,
)
# "Exception:" / "Exceptions." headers within a section body.
_EXCEPTION_RE = re.compile(r"\bException(?:s)?\s*[:.]", re.IGNORECASE)
# Cross-references such as "Section 403.2", "Sections 403.2 and 403.3",
# or "§ 403.2 through 403.6" (captures the whole reference list).
_CROSS_REF_RE = re.compile(
    r"(?:Section|Sections|§)\s+(\d{2,4}(?:\.\d+)*(?:\s*(?:,|and|through)\s*\d{2,4}(?:\.\d+)*)*)",
    re.IGNORECASE,
)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def extract_rich_metadata(section_id: str, text: str, code_type: str) -> dict:
    """Extract enhanced metadata from section text for better filtering.

    Derives parent-section ids from *section_id* and mines the body text for
    occupancy classes, construction types, exceptions, and cross-references.
    """
    parts = section_id.split(".")
    major = parts[0]
    minor = ".".join(parts[:2]) if len(parts) > 1 else major

    # Occupancy classes, de-duplicated in first-seen order.
    occupancy: list[str] = []
    for hit in _OCCUPANCY_RE.findall(text):
        for token in re.split(r"\s*,\s*", hit):
            token = token.strip().upper()
            if token and token not in occupancy:
                occupancy.append(token)

    # Construction types, unique and sorted.
    construction = sorted({hit.upper() for hit in _CONSTRUCTION_TYPE_RE.findall(text)})

    # Exception markers: one findall serves both the flag and the count.
    exception_hits = _EXCEPTION_RE.findall(text)

    # Cross-references, excluding self-references, first-seen order.
    xrefs: list[str] = []
    for hit in _CROSS_REF_RE.findall(text):
        for token in re.split(r"\s*(?:,|and|through)\s*", hit):
            token = token.strip()
            if token and token != section_id and token not in xrefs:
                xrefs.append(token)

    return {
        "section_full": section_id,
        "parent_major": major,
        "parent_minor": minor,
        "code_type": code_type,
        "occupancy_classes": occupancy,
        "construction_types": construction,
        "has_exceptions": bool(exception_hits),
        "exception_count": len(exception_hits),
        "cross_references": xrefs,
    }
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ---------------------------------------------------------------------------
|
| 113 |
+
# Core extraction with deduplication
|
| 114 |
+
# ---------------------------------------------------------------------------
|
| 115 |
+
|
| 116 |
+
def extract_trade_sections(
    file_path: str,
    global_dict: OrderedDict,
    code_type: str,
    seen_hashes: dict[str, set[str]],
) -> OrderedDict:
    """Extract code sections from a single source file with deduplication.

    Mutates and returns *global_dict* (section id -> {id, text, metadata}).
    *seen_hashes* maps section id -> set of MD5 digests of bodies already
    merged for that id; callers share both dicts across files of one code
    so repeated/continued sections accumulate instead of duplicating.
    """
    if not os.path.exists(file_path):
        return global_dict

    # \xa0 (non-breaking space) would defeat the whitespace-based regexes below.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().replace("\xa0", " ")

    # A file is keyed to one chapter digit / appendix letter; bail out when
    # no heading-like lines are found at all.
    anchor = get_dominant_anchor(content)
    if not anchor:
        return global_dict

    # Build section-matching regex
    if anchor.isalpha():
        # Appendix sections, e.g. "A101.1" (optional extra leading letter).
        id_pattern = rf"[A-Z]?{re.escape(anchor)}\d*(?:\.\d+)+"
    else:
        # Chapter sections, e.g. "403.2.1" for anchor "4".
        id_pattern = rf"{re.escape(anchor)}\d{{2,3}}(?:\.\d+)+"

    # Heading = optional "*"/"§" prefix, the section id, then its first word.
    pattern = rf"(?m)^\s*[\*§]?\s*({id_pattern})\s+([A-Z\w]+)"
    matches = list(re.finditer(pattern, content))

    # A heading whose "first word" is one of these is really a cross-reference
    # line ("Sections 403.2 and 403.3"), not a new section.
    skip_words = {
        "and", "through", "to", "or", "sections", "the", "of", "in", "under", "as",
    }

    for i in range(len(matches)):
        clean_id = matches[i].group(1).strip()
        first_word = matches[i].group(2)

        if first_word.lower() in skip_words:
            continue

        # Section body runs from this heading to the next one (or EOF).
        start_pos = matches[i].start()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)

        raw_body = content[start_pos:end_pos]
        clean_body = clean_and_flatten(raw_body)

        # Drop near-empty fragments (header-only lines, page artifacts).
        if len(clean_body) < 60:
            continue

        # ------ DEDUPLICATION via content hashing ------
        block_hash = hashlib.md5(clean_body.encode()).hexdigest()

        if clean_id in global_dict:
            # Check if this block is a genuine duplicate
            # (defensive: a caller may pass a prepopulated global_dict with
            # a fresh seen_hashes, so ensure the hash set exists).
            if clean_id not in seen_hashes:
                seen_hashes[clean_id] = set()
            if block_hash in seen_hashes[clean_id]:
                continue  # Skip exact duplicate
            seen_hashes[clean_id].add(block_hash)

            # New content under a known id: append as a continuation and
            # record the extra source file.
            global_dict[clean_id]["text"] += f" [CONT.]: {clean_body}"
            source_name = os.path.basename(file_path)
            if source_name not in global_dict[clean_id]["metadata"]["source"]:
                global_dict[clean_id]["metadata"]["source"] += f", {source_name}"
        else:
            seen_hashes[clean_id] = {block_hash}
            metadata = extract_rich_metadata(clean_id, clean_body, code_type)
            metadata["source"] = os.path.basename(file_path)

            # Prefix the embedded text with its hierarchy so retrieval keeps
            # chapter context even for deeply nested subsections.
            global_dict[clean_id] = {
                "id": clean_id,
                "text": f"CONTEXT: {metadata['parent_major']} > {metadata['parent_minor']} | CONTENT: {clean_id} {clean_body}",
                "metadata": metadata,
            }

    return global_dict
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# ---------------------------------------------------------------------------
|
| 192 |
+
# Main pipeline
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
+
|
| 195 |
+
# File ranges per code type (same as original, but parameterized)
|
| 196 |
+
CODE_CONFIGS = {
    # "file_range" lists the numbered source text files (NNN.txt) belonging
    # to each code. NOTE(review): the excluded numbers appear to be
    # non-section pages (e.g. tables/figures) — confirm against the source
    # text dump before changing ranges.
    "Building": {
        "file_range": [i for i in range(58, 112) if i not in {90, 91, 92, 93, 94, 100, 101, 103, 106, 107}],
        "output_file": "BUILDING_CODE.json",
    },
    "FuelGas": {
        "file_range": [i for i in range(43, 58) if i not in {50, 51, 52, 53, 54, 56}],
        "output_file": "FUEL_GAS_CODE.json",
    },
    "Mechanical": {
        "file_range": [i for i in range(24, 43) if i not in {30, 31}],
        "output_file": "MECHANICAL_CODE.json",
    },
    "Plumbing": {
        "file_range": list(range(1, 24)),
        "output_file": "PLUMBING_CODE.json",
    },
    "Administrative": {
        "file_range": list(range(112, 160)),
        "output_file": "GENERAL_ADMINISTRATIVE_PROVISIONS.json",
    },
}
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def preprocess_all(text_dir: str, output_dir: str) -> dict[str, int]:
    """Run preprocessing for all code types.

    Reads the numbered text files configured in CODE_CONFIGS from *text_dir*,
    extracts deduplicated sections, and writes one JSON file per code type
    into *output_dir*. Returns the section count per code type.
    """
    os.makedirs(output_dir, exist_ok=True)
    section_counts: dict[str, int] = {}

    for code_type, cfg in CODE_CONFIGS.items():
        # Fresh accumulators per code type; shared across that code's files
        # so continuations and duplicates are handled across file boundaries.
        sections: OrderedDict = OrderedDict()
        hashes_by_id: dict[str, set[str]] = {}

        for file_num in cfg["file_range"]:
            path = os.path.join(text_dir, f"{file_num:03d}.txt")
            if not os.path.exists(path):
                continue
            print(f"[{code_type}] Processing {path}...")
            extract_trade_sections(path, sections, code_type, hashes_by_id)

        result = list(sections.values())
        output_path = os.path.join(output_dir, cfg["output_file"])
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        section_counts[code_type] = len(result)
        print(f"[{code_type}] Wrote {len(result)} sections to {output_path}")

    return section_counts
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
if __name__ == "__main__":
    import sys

    # CLI: python preprocess_codes.py [text_dir] [output_dir]
    text_dir = sys.argv[1] if len(sys.argv) > 1 else "Text"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "data"

    counts = preprocess_all(text_dir, output_dir)
    print(f"\nPreprocessing complete: {counts}")
|
graph.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LangGraph definition — compliance workflow with parallel fan-out/fan-in and investigation loop."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from langgraph.graph import END, StateGraph
|
| 5 |
+
|
| 6 |
+
from config import MAX_INVESTIGATION_ROUNDS
|
| 7 |
+
from nodes.code_lookup import initial_code_lookup, targeted_code_lookup
|
| 8 |
+
from nodes.compliance_analyst import compliance_analyst
|
| 9 |
+
from nodes.compliance_planner import compliance_planner
|
| 10 |
+
from nodes.cropper import ProgressCallback, execute_crops
|
| 11 |
+
from nodes.annotator import annotate_crops
|
| 12 |
+
from nodes.deliberation import deliberation
|
| 13 |
+
from nodes.final_verdict import final_verdict
|
| 14 |
+
from state import ComplianceState
|
| 15 |
+
from tools.crop_cache import CropCache
|
| 16 |
+
from tools.image_store import ImageStore
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _build_compliance_graph(
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
) -> StateGraph:
    """Build the compliance analysis graph with parallel fan-out/fan-in.

    Flow:
      1. ``compliance_planner`` runs first and fans out in parallel to
         ``execute_crops`` and ``initial_code_lookup``.
      2. ``execute_crops`` optionally routes through ``annotate_crops``;
         both evidence paths fan in at ``compliance_analyst``.
      3. ``compliance_analyst`` may loop back for more evidence
         (``targeted_code_lookup`` or ``execute_crops``) while rounds remain,
         then proceeds to optional ``deliberation`` and ``final_verdict``,
         which ends the graph.

    The graph is returned uncompiled; see ``compile_compliance_graph``.
    """

    # ---- Wrap nodes that need ImageStore / CropCache / callback ----
    # LangGraph nodes take only the state, so bind the extra resources
    # via closures here.
    def _execute_crops(state: ComplianceState) -> dict:
        return execute_crops(state, image_store, crop_cache, progress_callback)

    def _annotate_crops(state: ComplianceState) -> dict:
        return annotate_crops(state, image_store)

    def _compliance_analyst(state: ComplianceState) -> dict:
        return compliance_analyst(state, image_store)

    def _deliberation(state: ComplianceState) -> dict:
        return deliberation(state, image_store)

    # ---- Build graph ----
    graph = StateGraph(ComplianceState)

    # Add all nodes
    graph.add_node("compliance_planner", compliance_planner)
    graph.add_node("execute_crops", _execute_crops)
    graph.add_node("annotate_crops", _annotate_crops)
    graph.add_node("initial_code_lookup", initial_code_lookup)
    graph.add_node("compliance_analyst", _compliance_analyst)
    graph.add_node("targeted_code_lookup", targeted_code_lookup)
    graph.add_node("deliberation", _deliberation)
    graph.add_node("final_verdict", final_verdict)

    # ---- Edges ----

    # Entry: planner is always first
    graph.set_entry_point("compliance_planner")

    # Parallel fan-out: planner → both execute_crops AND initial_code_lookup
    graph.add_edge("compliance_planner", "execute_crops")
    graph.add_edge("compliance_planner", "initial_code_lookup")

    # After crops: optionally annotate, then go to compliance_analyst
    def _after_crops(state: ComplianceState) -> str:
        # Skip annotation when disabled globally or when no crop task
        # actually requested it.
        if not state.get("enable_annotation", True):
            return "compliance_analyst"
        crop_tasks = state.get("crop_tasks", [])
        if not any(t.get("annotate") and t.get("annotation_prompt") for t in crop_tasks):
            return "compliance_analyst"
        return "annotate_crops"

    graph.add_conditional_edges(
        "execute_crops",
        _after_crops,
        {"annotate_crops": "annotate_crops", "compliance_analyst": "compliance_analyst"},
    )
    graph.add_edge("annotate_crops", "compliance_analyst")

    # Code lookup also feeds into compliance_analyst (fan-in)
    graph.add_edge("initial_code_lookup", "compliance_analyst")

    # After analysis: loop for more evidence, deliberate, or go to verdict
    def _after_analyst(state: ComplianceState) -> str:
        needs_more = state.get("needs_more_investigation", False)
        round_num = state.get("investigation_round", 0)
        max_rounds = state.get("max_rounds", MAX_INVESTIGATION_ROUNDS)
        has_additional_crops = bool(state.get("additional_crop_tasks", []))
        has_additional_code = bool(state.get("additional_code_queries", []))
        enable_consensus = state.get("enable_consensus", False)

        # Loop back only while under the round budget AND the analyst queued
        # concrete follow-up work; code lookup takes priority over crops.
        if needs_more and round_num < max_rounds:
            if has_additional_code:
                return "targeted_code_lookup"
            if has_additional_crops:
                return "execute_crops"
        if enable_consensus:
            return "deliberation"
        return "final_verdict"

    graph.add_conditional_edges(
        "compliance_analyst",
        _after_analyst,
        {
            "execute_crops": "execute_crops",
            "targeted_code_lookup": "targeted_code_lookup",
            "deliberation": "deliberation",
            "final_verdict": "final_verdict",
        },
    )

    # Targeted code lookup feeds back to analyst
    graph.add_edge("targeted_code_lookup", "compliance_analyst")

    # Deliberation → verdict
    graph.add_edge("deliberation", "final_verdict")

    # Verdict → END
    graph.add_edge("final_verdict", END)

    return graph
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def compile_compliance_graph(
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
):
    """Build and compile the compliance graph so it is ready to invoke."""
    workflow = _build_compliance_graph(
        image_store,
        crop_cache,
        progress_callback,
    )
    return workflow.compile()
|
nodes/__init__.py
ADDED
|
File without changes
|
nodes/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (230 Bytes). View file
|
|
|
nodes/__pycache__/annotator.cpython-313.pyc
ADDED
|
Binary file (5.18 kB). View file
|
|
|
nodes/__pycache__/code_lookup.cpython-313.pyc
ADDED
|
Binary file (10.6 kB). View file
|
|
|
nodes/__pycache__/compliance_analyst.cpython-313.pyc
ADDED
|
Binary file (8.35 kB). View file
|
|
|
nodes/__pycache__/compliance_planner.cpython-313.pyc
ADDED
|
Binary file (6.46 kB). View file
|
|
|
nodes/__pycache__/cropper.cpython-313.pyc
ADDED
|
Binary file (8.87 kB). View file
|
|
|
nodes/__pycache__/deliberation.cpython-313.pyc
ADDED
|
Binary file (3.32 kB). View file
|
|
|
nodes/__pycache__/final_verdict.cpython-313.pyc
ADDED
|
Binary file (3.74 kB). View file
|
|
|
nodes/__pycache__/metadata_generator.cpython-313.pyc
ADDED
|
Binary file (7.52 kB). View file
|
|
|
nodes/annotator.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""annotate_crops node — nano-banana (Gemini image generation) for semantic annotation."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import io
|
| 5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 6 |
+
|
| 7 |
+
from google import genai
|
| 8 |
+
from google.genai import types
|
| 9 |
+
from PIL import Image
|
| 10 |
+
|
| 11 |
+
from config import ANNOTATOR_MODEL, GOOGLE_API_KEY
|
| 12 |
+
from prompts.annotator import ANNOTATION_WRAPPER
|
| 13 |
+
from state import DrawingReaderState, ImageRef
|
| 14 |
+
from tools.image_store import ImageStore
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _extract_generated_image(response) -> Image.Image | None:
|
| 18 |
+
"""Extract the generated image from a Gemini image-generation response."""
|
| 19 |
+
for part in response.candidates[0].content.parts:
|
| 20 |
+
if part.inline_data is not None:
|
| 21 |
+
return Image.open(io.BytesIO(part.inline_data.data))
|
| 22 |
+
return None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _annotate_single_crop_sync(
|
| 26 |
+
client: genai.Client,
|
| 27 |
+
crop_ref: ImageRef,
|
| 28 |
+
annotation_prompt: str,
|
| 29 |
+
image_store: ImageStore,
|
| 30 |
+
) -> ImageRef | None:
|
| 31 |
+
"""Annotate one crop using nano-banana (synchronous)."""
|
| 32 |
+
crop_bytes = image_store.load_bytes(crop_ref)
|
| 33 |
+
|
| 34 |
+
full_prompt = ANNOTATION_WRAPPER.format(annotation_prompt=annotation_prompt)
|
| 35 |
+
|
| 36 |
+
response = client.models.generate_content(
|
| 37 |
+
model=ANNOTATOR_MODEL,
|
| 38 |
+
contents=[
|
| 39 |
+
types.Part.from_bytes(data=crop_bytes, mime_type="image/png"),
|
| 40 |
+
full_prompt,
|
| 41 |
+
],
|
| 42 |
+
config=types.GenerateContentConfig(
|
| 43 |
+
response_modalities=["TEXT", "IMAGE"],
|
| 44 |
+
),
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
annotated_image = _extract_generated_image(response)
|
| 48 |
+
if annotated_image is None:
|
| 49 |
+
return None
|
| 50 |
+
|
| 51 |
+
ref = image_store.save_annotated(crop_ref, annotated_image)
|
| 52 |
+
return ref
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def annotate_crops(state: DrawingReaderState, image_store: ImageStore) -> dict:
|
| 56 |
+
"""Run nano-banana annotation on crops that need it."""
|
| 57 |
+
crop_tasks = state.get("crop_tasks", [])
|
| 58 |
+
image_refs = state.get("image_refs", [])
|
| 59 |
+
|
| 60 |
+
# Build a mapping: find crops that need annotation.
|
| 61 |
+
# The most recent batch of crops corresponds to the current crop_tasks.
|
| 62 |
+
# Take the LAST len(crop_tasks) crops from image_refs to match by position,
|
| 63 |
+
# so that on loop-back rounds we only match against the newest crops.
|
| 64 |
+
crops_needing_annotation: list[tuple[ImageRef, str]] = []
|
| 65 |
+
|
| 66 |
+
all_crops = [r for r in image_refs if r["crop_type"] == "crop"]
|
| 67 |
+
# Only the tail — the most recent batch produced by execute_crops
|
| 68 |
+
recent_crops = all_crops[-len(crop_tasks):] if crop_tasks else []
|
| 69 |
+
|
| 70 |
+
for i, task in enumerate(crop_tasks):
|
| 71 |
+
if task["annotate"] and task["annotation_prompt"] and i < len(recent_crops):
|
| 72 |
+
crops_needing_annotation.append(
|
| 73 |
+
(recent_crops[i], task["annotation_prompt"])
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
if not crops_needing_annotation:
|
| 77 |
+
return {"status_message": ["No annotation needed for these crops."]}
|
| 78 |
+
|
| 79 |
+
client = genai.Client(api_key=GOOGLE_API_KEY)
|
| 80 |
+
|
| 81 |
+
# Use a thread pool instead of asyncio to avoid event-loop conflicts
|
| 82 |
+
# with Streamlit's own event loop.
|
| 83 |
+
results: list[ImageRef | None | Exception] = [None] * len(crops_needing_annotation)
|
| 84 |
+
|
| 85 |
+
with ThreadPoolExecutor(max_workers=min(len(crops_needing_annotation), 4)) as pool:
|
| 86 |
+
future_to_idx = {}
|
| 87 |
+
for i, (ref, prompt) in enumerate(crops_needing_annotation):
|
| 88 |
+
future = pool.submit(
|
| 89 |
+
_annotate_single_crop_sync, client, ref, prompt, image_store,
|
| 90 |
+
)
|
| 91 |
+
future_to_idx[future] = i
|
| 92 |
+
|
| 93 |
+
for future in as_completed(future_to_idx):
|
| 94 |
+
idx = future_to_idx[future]
|
| 95 |
+
try:
|
| 96 |
+
results[idx] = future.result()
|
| 97 |
+
except Exception as e:
|
| 98 |
+
results[idx] = e
|
| 99 |
+
|
| 100 |
+
annotated_refs: list[ImageRef] = []
|
| 101 |
+
errors: list[str] = []
|
| 102 |
+
for i, result in enumerate(results):
|
| 103 |
+
if isinstance(result, Exception):
|
| 104 |
+
errors.append(f"Annotation {i} failed: {result}")
|
| 105 |
+
elif result is not None:
|
| 106 |
+
annotated_refs.append(result)
|
| 107 |
+
else:
|
| 108 |
+
errors.append(f"Annotation {i} returned no image")
|
| 109 |
+
|
| 110 |
+
status = f"Annotated {len(annotated_refs)} of {len(crops_needing_annotation)} crops."
|
| 111 |
+
if errors:
|
| 112 |
+
status += f" Issues: {'; '.join(errors)}"
|
| 113 |
+
|
| 114 |
+
return {
|
| 115 |
+
"image_refs": annotated_refs,
|
| 116 |
+
"status_message": [status],
|
| 117 |
+
}
|
nodes/code_lookup.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""code_lookup node — lightweight snippet reviewer (no multi-turn tool loop).
|
| 2 |
+
|
| 3 |
+
Flow:
|
| 4 |
+
1. discover_code_locations() — ChromaDB semantic search (~1-2 sec per query)
|
| 5 |
+
2. GPT reviews the raw snippets in a SINGLE call — flags relevant ones with
|
| 6 |
+
a relevance tag and brief note
|
| 7 |
+
3. Raw flagged snippets + GPT notes go to the compliance analyst
|
| 8 |
+
|
| 9 |
+
No fetch_full_chapter in the initial pass. The compliance analyst can request
|
| 10 |
+
targeted chapter fetches via additional_code_queries if it needs more context.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import logging
|
| 16 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
|
| 19 |
+
from openai import OpenAI
|
| 20 |
+
|
| 21 |
+
from config import CODE_LOOKUP_MODEL, OPENAI_API_KEY
|
| 22 |
+
from prompts.code_lookup import CODE_REVIEWER_SYSTEM_PROMPT
|
| 23 |
+
from state import AgentMessage, CodeQuery, CodeSection, ComplianceState
|
| 24 |
+
from tools.chroma_tools import QueryCache, discover_code_locations
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# Single-call snippet reviewer (replaces the multi-turn tool loop)
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
def _review_snippets(
|
| 34 |
+
research_goal: str,
|
| 35 |
+
discover_report: str,
|
| 36 |
+
) -> tuple[str, list[CodeSection]]:
|
| 37 |
+
"""GPT reviews discover results in ONE call.
|
| 38 |
+
|
| 39 |
+
Returns (brief_review, flagged_sections).
|
| 40 |
+
"""
|
| 41 |
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
| 42 |
+
|
| 43 |
+
response = client.chat.completions.create(
|
| 44 |
+
model=CODE_LOOKUP_MODEL,
|
| 45 |
+
messages=[
|
| 46 |
+
{"role": "system", "content": CODE_REVIEWER_SYSTEM_PROMPT},
|
| 47 |
+
{
|
| 48 |
+
"role": "user",
|
| 49 |
+
"content": (
|
| 50 |
+
f"## Research Goal\n{research_goal}\n\n"
|
| 51 |
+
f"## Code Snippets from Database\n{discover_report}"
|
| 52 |
+
),
|
| 53 |
+
},
|
| 54 |
+
],
|
| 55 |
+
response_format={"type": "json_object"},
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
raw = response.choices[0].message.content or "{}"
|
| 59 |
+
try:
|
| 60 |
+
parsed = json.loads(raw)
|
| 61 |
+
except json.JSONDecodeError:
|
| 62 |
+
logger.warning("GPT snippet review returned invalid JSON, using raw text.")
|
| 63 |
+
return raw, []
|
| 64 |
+
|
| 65 |
+
flagged_sections: list[CodeSection] = []
|
| 66 |
+
for item in parsed.get("relevant_sections", []):
|
| 67 |
+
flagged_sections.append(
|
| 68 |
+
CodeSection(
|
| 69 |
+
section_full=item.get("section_id", "?"),
|
| 70 |
+
code_type=item.get("code_type", "?"),
|
| 71 |
+
parent_major=item.get("chapter", "?"),
|
| 72 |
+
text=item.get("snippet", "")[:1500],
|
| 73 |
+
relevance=item.get("relevance_note", ""),
|
| 74 |
+
)
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
summary = parsed.get("summary", "No summary provided.")
|
| 78 |
+
return summary, flagged_sections
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _run_single_lookup(
|
| 82 |
+
cq: CodeQuery,
|
| 83 |
+
query_cache: QueryCache | None = None,
|
| 84 |
+
) -> tuple[str, list[CodeSection], str]:
|
| 85 |
+
"""Run discover + review for ONE code query.
|
| 86 |
+
|
| 87 |
+
Returns (summary, flagged_sections, discover_report_raw).
|
| 88 |
+
"""
|
| 89 |
+
research_goal = f"{cq['query']} (Context: {cq['context']})"
|
| 90 |
+
logger.info("ChromaDB query: %s", research_goal)
|
| 91 |
+
|
| 92 |
+
# Step 1: ChromaDB discover (fast, ~1-2s)
|
| 93 |
+
discover_report = discover_code_locations(research_goal, cache=query_cache)
|
| 94 |
+
|
| 95 |
+
# Step 2: GPT reviews snippets in a single call
|
| 96 |
+
summary, flagged = _review_snippets(research_goal, discover_report)
|
| 97 |
+
|
| 98 |
+
return summary, flagged, discover_report
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
# LangGraph node functions
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def initial_code_lookup(state: ComplianceState) -> dict:
|
| 107 |
+
"""Run the initial code lookup based on planner's code_queries.
|
| 108 |
+
|
| 109 |
+
All queries run in PARALLEL via ThreadPoolExecutor.
|
| 110 |
+
Each query = 1 discover + 1 GPT review call (no multi-turn loop).
|
| 111 |
+
"""
|
| 112 |
+
code_queries = state.get("code_queries", [])
|
| 113 |
+
if not code_queries:
|
| 114 |
+
return {
|
| 115 |
+
"code_report": "No code queries were planned.",
|
| 116 |
+
"code_sections": [],
|
| 117 |
+
"discussion_log": [
|
| 118 |
+
AgentMessage(
|
| 119 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 120 |
+
agent="code_analyst",
|
| 121 |
+
action="search_code",
|
| 122 |
+
summary="No code queries to execute.",
|
| 123 |
+
detail="The planner did not generate any code queries.",
|
| 124 |
+
evidence_refs=[],
|
| 125 |
+
)
|
| 126 |
+
],
|
| 127 |
+
"status_message": ["No code queries to execute."],
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
query_cache = QueryCache()
|
| 131 |
+
discussion_messages: list[AgentMessage] = []
|
| 132 |
+
|
| 133 |
+
# Add "searching" messages for all queries upfront
|
| 134 |
+
for cq in code_queries:
|
| 135 |
+
discussion_messages.append(
|
| 136 |
+
AgentMessage(
|
| 137 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 138 |
+
agent="code_analyst",
|
| 139 |
+
action="search_code",
|
| 140 |
+
summary=f"Searching: {cq['query'][:80]}...",
|
| 141 |
+
detail=f"Focus area: {cq['focus_area']}\nContext: {cq['context']}",
|
| 142 |
+
evidence_refs=[],
|
| 143 |
+
)
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
# Execute ALL queries concurrently
|
| 147 |
+
results: dict[int, tuple[str, list[CodeSection], str]] = {}
|
| 148 |
+
|
| 149 |
+
with ThreadPoolExecutor(max_workers=min(len(code_queries), 4)) as pool:
|
| 150 |
+
futures = {
|
| 151 |
+
pool.submit(_run_single_lookup, cq, query_cache): i
|
| 152 |
+
for i, cq in enumerate(code_queries)
|
| 153 |
+
}
|
| 154 |
+
for future in as_completed(futures):
|
| 155 |
+
i = futures[future]
|
| 156 |
+
try:
|
| 157 |
+
summary, flagged, _raw = future.result()
|
| 158 |
+
results[i] = (summary, flagged, _raw)
|
| 159 |
+
cq = code_queries[i]
|
| 160 |
+
section_ids = ", ".join(
|
| 161 |
+
s["section_full"] for s in flagged
|
| 162 |
+
)
|
| 163 |
+
discussion_messages.append(
|
| 164 |
+
AgentMessage(
|
| 165 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 166 |
+
agent="code_analyst",
|
| 167 |
+
action="search_code",
|
| 168 |
+
summary=(
|
| 169 |
+
f"Flagged {len(flagged)} sections "
|
| 170 |
+
f"for '{cq['query']}'"
|
| 171 |
+
),
|
| 172 |
+
detail=(
|
| 173 |
+
f"**Query:** {cq['query']}\n"
|
| 174 |
+
f"**Focus:** {cq['focus_area']}\n\n"
|
| 175 |
+
f"**Sections:** {section_ids}\n\n"
|
| 176 |
+
f"{summary[:800]}"
|
| 177 |
+
),
|
| 178 |
+
evidence_refs=[s["section_full"] for s in flagged],
|
| 179 |
+
)
|
| 180 |
+
)
|
| 181 |
+
except Exception as e:
|
| 182 |
+
logger.error("Code query %d failed: %s", i, e)
|
| 183 |
+
results[i] = (f"Error: {e}", [], "")
|
| 184 |
+
discussion_messages.append(
|
| 185 |
+
AgentMessage(
|
| 186 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 187 |
+
agent="code_analyst",
|
| 188 |
+
action="search_code",
|
| 189 |
+
summary=f"Query {i + 1} failed: {e}",
|
| 190 |
+
detail=str(e),
|
| 191 |
+
evidence_refs=[],
|
| 192 |
+
)
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# Reassemble in original order
|
| 196 |
+
report_parts: list[str] = []
|
| 197 |
+
all_sections: list[CodeSection] = []
|
| 198 |
+
for i in range(len(code_queries)):
|
| 199 |
+
summary, flagged, _raw = results.get(i, ("No result", [], ""))
|
| 200 |
+
cq = code_queries[i]
|
| 201 |
+
report_parts.append(
|
| 202 |
+
f"### Query {i + 1}: {cq['focus_area']}\n{summary}"
|
| 203 |
+
)
|
| 204 |
+
all_sections.extend(flagged)
|
| 205 |
+
|
| 206 |
+
combined_report = "\n\n---\n\n".join(report_parts)
|
| 207 |
+
|
| 208 |
+
return {
|
| 209 |
+
"code_report": combined_report,
|
| 210 |
+
"code_sections": all_sections,
|
| 211 |
+
"discussion_log": discussion_messages,
|
| 212 |
+
"status_message": [
|
| 213 |
+
f"Code lookup complete. {len(all_sections)} relevant sections "
|
| 214 |
+
f"flagged across {len(code_queries)} queries."
|
| 215 |
+
],
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def targeted_code_lookup(state: ComplianceState) -> dict:
|
| 220 |
+
"""Run additional code lookups requested by the compliance analyst.
|
| 221 |
+
|
| 222 |
+
These may use fetch_full_chapter for deeper context when the analyst
|
| 223 |
+
needs full exception text or cross-reference detail.
|
| 224 |
+
"""
|
| 225 |
+
additional_queries = state.get("additional_code_queries", [])
|
| 226 |
+
if not additional_queries:
|
| 227 |
+
return {
|
| 228 |
+
"status_message": ["No additional code queries."],
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
query_cache = QueryCache()
|
| 232 |
+
all_sections: list[CodeSection] = []
|
| 233 |
+
report_parts: list[str] = []
|
| 234 |
+
discussion_messages: list[AgentMessage] = []
|
| 235 |
+
|
| 236 |
+
for i, cq in enumerate(additional_queries):
|
| 237 |
+
discussion_messages.append(
|
| 238 |
+
AgentMessage(
|
| 239 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 240 |
+
agent="code_analyst",
|
| 241 |
+
action="search_code",
|
| 242 |
+
summary=f"Follow-up search: {cq['query'][:80]}...",
|
| 243 |
+
detail=f"Requested by compliance analyst.\nFocus: {cq['focus_area']}",
|
| 244 |
+
evidence_refs=[],
|
| 245 |
+
)
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
summary, flagged, _raw = _run_single_lookup(cq, query_cache)
|
| 249 |
+
report_parts.append(summary)
|
| 250 |
+
all_sections.extend(flagged)
|
| 251 |
+
|
| 252 |
+
section_ids = ", ".join(s["section_full"] for s in flagged)
|
| 253 |
+
discussion_messages.append(
|
| 254 |
+
AgentMessage(
|
| 255 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 256 |
+
agent="code_analyst",
|
| 257 |
+
action="search_code",
|
| 258 |
+
summary=(
|
| 259 |
+
f"Follow-up: flagged {len(flagged)} sections "
|
| 260 |
+
f"for '{cq['query']}'"
|
| 261 |
+
),
|
| 262 |
+
detail=(
|
| 263 |
+
f"**Query:** {cq['query']}\n"
|
| 264 |
+
f"**Focus:** {cq['focus_area']}\n\n"
|
| 265 |
+
f"**Sections:** {section_ids}\n\n"
|
| 266 |
+
f"{summary[:800]}"
|
| 267 |
+
),
|
| 268 |
+
evidence_refs=[s["section_full"] for s in flagged],
|
| 269 |
+
)
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
# Append to existing report
|
| 273 |
+
existing_report = state.get("code_report", "")
|
| 274 |
+
new_report = "\n\n---\n\n".join(report_parts)
|
| 275 |
+
combined_report = f"{existing_report}\n\n## FOLLOW-UP CODE RESEARCH\n\n{new_report}"
|
| 276 |
+
|
| 277 |
+
return {
|
| 278 |
+
"code_report": combined_report,
|
| 279 |
+
"code_sections": all_sections,
|
| 280 |
+
"additional_code_queries": [], # Clear after processing
|
| 281 |
+
"discussion_log": discussion_messages,
|
| 282 |
+
"status_message": [
|
| 283 |
+
f"Targeted code lookup complete. {len(all_sections)} additional sections "
|
| 284 |
+
f"from {len(additional_queries)} follow-up queries."
|
| 285 |
+
],
|
| 286 |
+
}
|
nodes/compliance_analyst.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""compliance_analyst node — multimodal fusion of images + code for compliance determination."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
from google import genai
|
| 9 |
+
from google.genai import types
|
| 10 |
+
|
| 11 |
+
from config import ANALYZER_MODEL, GOOGLE_API_KEY
|
| 12 |
+
from prompts.compliance_analyst import COMPLIANCE_ANALYST_SYSTEM_PROMPT
|
| 13 |
+
from state import AgentMessage, CodeQuery, ComplianceState, CropTask
|
| 14 |
+
from tools.image_store import ImageStore
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def compliance_analyst(state: ComplianceState, image_store: ImageStore) -> dict:
|
| 18 |
+
"""Review all cropped images AND code sections to produce compliance findings."""
|
| 19 |
+
question = state["question"]
|
| 20 |
+
image_refs = state.get("image_refs", [])
|
| 21 |
+
code_report = state.get("code_report", "")
|
| 22 |
+
legend_pages = set(state.get("legend_pages", []))
|
| 23 |
+
investigation_round = state.get("investigation_round", 0)
|
| 24 |
+
discussion_log = state.get("discussion_log", [])
|
| 25 |
+
|
| 26 |
+
client = genai.Client(api_key=GOOGLE_API_KEY)
|
| 27 |
+
|
| 28 |
+
# Build multimodal content
|
| 29 |
+
content_parts: list[types.Part] = []
|
| 30 |
+
|
| 31 |
+
# 1. User question
|
| 32 |
+
content_parts.append(types.Part.from_text(text=f"USER COMPLIANCE QUESTION: {question}"))
|
| 33 |
+
|
| 34 |
+
# 2. Code report (legal requirements)
|
| 35 |
+
if code_report:
|
| 36 |
+
content_parts.append(
|
| 37 |
+
types.Part.from_text(
|
| 38 |
+
text=f"\n=== LEGAL REQUIREMENTS FROM NYC CODE ===\n{code_report}"
|
| 39 |
+
)
|
| 40 |
+
)
|
| 41 |
+
else:
|
| 42 |
+
content_parts.append(
|
| 43 |
+
types.Part.from_text(text="\n=== NO CODE SECTIONS RETRIEVED ===\n")
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# 3. Discussion log summary (what previous agents found)
|
| 47 |
+
if discussion_log:
|
| 48 |
+
log_summary = "\n".join(
|
| 49 |
+
f"[{m['timestamp']}] {m['agent']}: {m['summary']}"
|
| 50 |
+
for m in discussion_log[-10:] # Last 10 messages
|
| 51 |
+
)
|
| 52 |
+
content_parts.append(
|
| 53 |
+
types.Part.from_text(
|
| 54 |
+
text=f"\n=== AGENT DISCUSSION LOG ===\n{log_summary}"
|
| 55 |
+
)
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
# 4. Images — legends first, then detail crops, then annotated
|
| 59 |
+
legend_refs = [r for r in image_refs if r["page_num"] in legend_pages and r["crop_type"] == "crop"]
|
| 60 |
+
detail_crops = [r for r in image_refs if r["page_num"] not in legend_pages and r["crop_type"] == "crop"]
|
| 61 |
+
annotated_refs = [r for r in image_refs if r["crop_type"] == "annotated"]
|
| 62 |
+
|
| 63 |
+
ordered_refs = legend_refs + detail_crops + annotated_refs
|
| 64 |
+
|
| 65 |
+
if legend_refs:
|
| 66 |
+
content_parts.append(
|
| 67 |
+
types.Part.from_text(text="\n=== LEGEND / SCHEDULE CROPS (study these first) ===")
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
first_detail_id = detail_crops[0]["id"] if detail_crops else None
|
| 71 |
+
first_annotated_id = annotated_refs[0]["id"] if annotated_refs else None
|
| 72 |
+
|
| 73 |
+
for ref in ordered_refs:
|
| 74 |
+
if first_detail_id is not None and ref["id"] == first_detail_id:
|
| 75 |
+
content_parts.append(types.Part.from_text(text="\n=== DETAIL CROPS ==="))
|
| 76 |
+
if first_annotated_id is not None and ref["id"] == first_annotated_id:
|
| 77 |
+
content_parts.append(
|
| 78 |
+
types.Part.from_text(text="\n=== ANNOTATED CROPS (highlighted versions) ===")
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
content_parts.append(types.Part.from_text(text=f"\nImage: {ref['label']}"))
|
| 82 |
+
try:
|
| 83 |
+
content_parts.append(image_store.to_gemini_part(ref))
|
| 84 |
+
except Exception as e:
|
| 85 |
+
content_parts.append(
|
| 86 |
+
types.Part.from_text(text=f"(Could not load image: {e})")
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# 5. Investigation round context
|
| 90 |
+
content_parts.append(
|
| 91 |
+
types.Part.from_text(
|
| 92 |
+
text=(
|
| 93 |
+
f"\nThis is investigation round {investigation_round + 1}. "
|
| 94 |
+
"Analyze the drawings against the code requirements. "
|
| 95 |
+
"If you need more evidence (crops or code lookups), include a JSON block at the end."
|
| 96 |
+
)
|
| 97 |
+
)
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# Call Gemini
|
| 101 |
+
response = client.models.generate_content(
|
| 102 |
+
model=ANALYZER_MODEL,
|
| 103 |
+
contents=[types.Content(role="user", parts=content_parts)],
|
| 104 |
+
config=types.GenerateContentConfig(
|
| 105 |
+
system_instruction=COMPLIANCE_ANALYST_SYSTEM_PROMPT,
|
| 106 |
+
),
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
analysis_text = response.text
|
| 110 |
+
|
| 111 |
+
# Parse additional investigation requests
|
| 112 |
+
needs_more = False
|
| 113 |
+
additional_crops: list[CropTask] = []
|
| 114 |
+
additional_code_queries: list[CodeQuery] = []
|
| 115 |
+
|
| 116 |
+
json_match = re.search(
|
| 117 |
+
r"```json\s*(\{.*?\"needs_more\"\s*:\s*true.*?\})\s*```",
|
| 118 |
+
analysis_text,
|
| 119 |
+
re.DOTALL,
|
| 120 |
+
)
|
| 121 |
+
if json_match:
|
| 122 |
+
try:
|
| 123 |
+
extra = json.loads(json_match.group(1))
|
| 124 |
+
if extra.get("needs_more"):
|
| 125 |
+
needs_more = True
|
| 126 |
+
|
| 127 |
+
for t in extra.get("additional_crops", []):
|
| 128 |
+
raw_page = int(t.get("page_num", 1))
|
| 129 |
+
additional_crops.append(
|
| 130 |
+
CropTask(
|
| 131 |
+
page_num=raw_page - 1,
|
| 132 |
+
crop_instruction=t.get("crop_instruction", ""),
|
| 133 |
+
annotate=bool(t.get("annotate", False)),
|
| 134 |
+
annotation_prompt=t.get("annotation_prompt", ""),
|
| 135 |
+
label=t.get("label", "Additional crop"),
|
| 136 |
+
priority=int(t.get("priority", 1)),
|
| 137 |
+
)
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
for q in extra.get("additional_code_queries", []):
|
| 141 |
+
additional_code_queries.append(
|
| 142 |
+
CodeQuery(
|
| 143 |
+
query=q.get("query", ""),
|
| 144 |
+
focus_area=q.get("focus_area", ""),
|
| 145 |
+
context=q.get("context", ""),
|
| 146 |
+
priority=int(q.get("priority", 0)),
|
| 147 |
+
)
|
| 148 |
+
)
|
| 149 |
+
except (json.JSONDecodeError, KeyError):
|
| 150 |
+
pass
|
| 151 |
+
|
| 152 |
+
# Clean the JSON block from the analysis text
|
| 153 |
+
analysis_text = analysis_text[: json_match.start()].strip()
|
| 154 |
+
|
| 155 |
+
# Build discussion message
|
| 156 |
+
if needs_more:
|
| 157 |
+
summary = (
|
| 158 |
+
f"Round {investigation_round + 1} analysis complete. "
|
| 159 |
+
f"Requesting {len(additional_crops)} more crops and "
|
| 160 |
+
f"{len(additional_code_queries)} more code lookups."
|
| 161 |
+
)
|
| 162 |
+
else:
|
| 163 |
+
summary = f"Round {investigation_round + 1} compliance analysis complete."
|
| 164 |
+
|
| 165 |
+
discussion_msg = AgentMessage(
|
| 166 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 167 |
+
agent="compliance_analyst",
|
| 168 |
+
action="analyze" if not needs_more else "request_more",
|
| 169 |
+
summary=summary,
|
| 170 |
+
detail=analysis_text[:1500],
|
| 171 |
+
evidence_refs=[ref["id"] for ref in image_refs[:5]],
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
result: dict = {
|
| 175 |
+
"compliance_analysis": analysis_text,
|
| 176 |
+
"investigation_round": investigation_round + 1,
|
| 177 |
+
"needs_more_investigation": needs_more,
|
| 178 |
+
"discussion_log": [discussion_msg],
|
| 179 |
+
"status_message": [summary],
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
if additional_crops:
|
| 183 |
+
result["crop_tasks"] = additional_crops
|
| 184 |
+
result["additional_crop_tasks"] = additional_crops
|
| 185 |
+
if additional_code_queries:
|
| 186 |
+
result["additional_code_queries"] = additional_code_queries
|
| 187 |
+
|
| 188 |
+
return result
|
nodes/compliance_planner.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""compliance_planner node — dual-plan generation (crops + code queries)."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
from google import genai
|
| 9 |
+
from google.genai import types
|
| 10 |
+
|
| 11 |
+
from config import GOOGLE_API_KEY, PLANNER_MODEL
|
| 12 |
+
from prompts.compliance_planner import COMPLIANCE_PLANNER_SYSTEM_PROMPT
|
| 13 |
+
from state import AgentMessage, CodeQuery, ComplianceState, CropTask
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def compliance_planner(state: ComplianceState) -> dict:
|
| 17 |
+
"""Analyze page metadata + user question and produce dual plans for
|
| 18 |
+
image cropping AND code lookup."""
|
| 19 |
+
question = state["question"]
|
| 20 |
+
num_pages = state.get("num_pages", 0)
|
| 21 |
+
page_metadata_json = state.get("page_metadata_json", "")
|
| 22 |
+
investigation_round = state.get("investigation_round", 0)
|
| 23 |
+
|
| 24 |
+
client = genai.Client(api_key=GOOGLE_API_KEY)
|
| 25 |
+
|
| 26 |
+
question_text = (
|
| 27 |
+
f"USER COMPLIANCE QUESTION: {question}\n\n"
|
| 28 |
+
f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
|
| 29 |
+
f"This is investigation round {investigation_round + 1}.\n\n"
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
if page_metadata_json:
|
| 33 |
+
question_text += f"PAGE METADATA:\n{page_metadata_json}"
|
| 34 |
+
else:
|
| 35 |
+
question_text += (
|
| 36 |
+
"No page metadata available. Based on the question alone, "
|
| 37 |
+
"plan what code lookups are needed. Crop tasks will use default pages."
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
response = client.models.generate_content(
|
| 41 |
+
model=PLANNER_MODEL,
|
| 42 |
+
contents=[types.Content(role="user", parts=[types.Part.from_text(text=question_text)])],
|
| 43 |
+
config=types.GenerateContentConfig(
|
| 44 |
+
system_instruction=COMPLIANCE_PLANNER_SYSTEM_PROMPT,
|
| 45 |
+
),
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
response_text = response.text.strip()
|
| 49 |
+
|
| 50 |
+
# Parse JSON response
|
| 51 |
+
json_match = re.search(r"\{.*\}", response_text, re.DOTALL)
|
| 52 |
+
|
| 53 |
+
target_pages: list[int] = []
|
| 54 |
+
legend_pages: list[int] = []
|
| 55 |
+
crop_tasks: list[CropTask] = []
|
| 56 |
+
code_queries: list[CodeQuery] = []
|
| 57 |
+
|
| 58 |
+
if json_match:
|
| 59 |
+
try:
|
| 60 |
+
parsed = json.loads(json_match.group())
|
| 61 |
+
valid_0indexed = set(range(num_pages))
|
| 62 |
+
|
| 63 |
+
target_pages = [
|
| 64 |
+
int(p) - 1 for p in parsed.get("target_pages", [])
|
| 65 |
+
if int(p) - 1 in valid_0indexed
|
| 66 |
+
]
|
| 67 |
+
legend_pages = [
|
| 68 |
+
int(p) - 1 for p in parsed.get("legend_pages", [])
|
| 69 |
+
if int(p) - 1 in valid_0indexed
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
for t in parsed.get("crop_tasks", []):
|
| 73 |
+
raw_page = int(t.get("page_num", 1))
|
| 74 |
+
crop_tasks.append(
|
| 75 |
+
CropTask(
|
| 76 |
+
page_num=raw_page - 1,
|
| 77 |
+
crop_instruction=t.get("crop_instruction", ""),
|
| 78 |
+
annotate=bool(t.get("annotate", False)),
|
| 79 |
+
annotation_prompt=t.get("annotation_prompt", ""),
|
| 80 |
+
label=t.get("label", f"Page {raw_page} crop"),
|
| 81 |
+
priority=int(t.get("priority", 1)),
|
| 82 |
+
)
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
for q in parsed.get("code_queries", []):
|
| 86 |
+
code_queries.append(
|
| 87 |
+
CodeQuery(
|
| 88 |
+
query=q.get("query", ""),
|
| 89 |
+
focus_area=q.get("focus_area", ""),
|
| 90 |
+
context=q.get("context", ""),
|
| 91 |
+
priority=int(q.get("priority", 0)),
|
| 92 |
+
)
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
except (json.JSONDecodeError, ValueError, KeyError):
|
| 96 |
+
pass
|
| 97 |
+
|
| 98 |
+
# Sort crop tasks by priority
|
| 99 |
+
crop_tasks.sort(key=lambda t: t["priority"])
|
| 100 |
+
|
| 101 |
+
# Fallback: if nothing identified, use first 5 pages
|
| 102 |
+
if not target_pages and not crop_tasks:
|
| 103 |
+
target_pages = list(range(min(num_pages, 5)))
|
| 104 |
+
|
| 105 |
+
# Build discussion log message
|
| 106 |
+
crop_summary = f"{len(crop_tasks)} crop tasks on pages {', '.join(str(p + 1) for p in target_pages[:5])}"
|
| 107 |
+
code_summary = f"{len(code_queries)} code queries"
|
| 108 |
+
if code_queries:
|
| 109 |
+
code_summary += f" ({', '.join(q['focus_area'] for q in code_queries[:3])})"
|
| 110 |
+
|
| 111 |
+
discussion_msg = AgentMessage(
|
| 112 |
+
timestamp=datetime.now().strftime("%H:%M:%S"),
|
| 113 |
+
agent="planner",
|
| 114 |
+
action="plan",
|
| 115 |
+
summary=f"Planned {crop_summary} and {code_summary}.",
|
| 116 |
+
detail=response_text,
|
| 117 |
+
evidence_refs=[],
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
return {
|
| 121 |
+
"target_pages": target_pages,
|
| 122 |
+
"legend_pages": legend_pages,
|
| 123 |
+
"crop_tasks": crop_tasks,
|
| 124 |
+
"code_queries": code_queries,
|
| 125 |
+
"discussion_log": [discussion_msg],
|
| 126 |
+
"status_message": [
|
| 127 |
+
f"Selected {len(target_pages)} pages ({len(legend_pages)} legends), "
|
| 128 |
+
f"planned {len(crop_tasks)} crop tasks, {len(code_queries)} code queries."
|
| 129 |
+
],
|
| 130 |
+
}
|
nodes/cropper.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""execute_crops node — Gemini code_execution for agentic cropping (PoC 1 style)."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import io
|
| 5 |
+
import logging
|
| 6 |
+
import time
|
| 7 |
+
import uuid
|
| 8 |
+
from collections.abc import Callable
|
| 9 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 10 |
+
|
| 11 |
+
from google import genai
|
| 12 |
+
from google.genai import types
|
| 13 |
+
from PIL import Image
|
| 14 |
+
|
| 15 |
+
from config import CROPPER_MODEL, GOOGLE_API_KEY
|
| 16 |
+
from prompts.cropper import CROPPER_PROMPT_TEMPLATE
|
| 17 |
+
from state import CropTask, DrawingReaderState, ImageRef
|
| 18 |
+
from tools.crop_cache import CropCache
|
| 19 |
+
from tools.image_store import ImageStore
|
| 20 |
+
from tools.pdf_processor import get_page_image_bytes
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Type alias for the progress callback.
|
| 25 |
+
# Signature: (completed_ref, crop_task, source, completed_count, total_count)
|
| 26 |
+
ProgressCallback = Callable[[ImageRef, CropTask, str, int, int], None]
|
| 27 |
+
|
| 28 |
+
# Retry settings for transient API errors (429 / 503)
|
| 29 |
+
MAX_RETRIES = 3
|
| 30 |
+
RETRY_BASE_DELAY = 2.0 # seconds
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _extract_last_image(response) -> Image.Image | None:
    """Return the final image produced in a Gemini code_execution response.

    Walks every part of the first candidate; the last part that yields
    decodable image data wins. Returns ``None`` when no part carries an
    image. Decoding failures on individual parts are swallowed so one bad
    part never hides a later good one.
    """
    found: Image.Image | None = None
    for part in response.candidates[0].content.parts:
        # Preferred path: the SDK's as_image() accessor.
        try:
            sdk_image = part.as_image()
            if sdk_image is not None:
                found = Image.open(io.BytesIO(sdk_image.image_bytes))
                continue
        except Exception:
            pass
        # Fallback path: raw inline_data bytes on the part.
        try:
            inline = getattr(part, "inline_data", None)
            if inline is not None:
                found = Image.open(io.BytesIO(inline.data))
        except Exception:
            pass
    return found
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _execute_single_crop_sync(
    client: genai.Client,
    page_image_bytes: bytes,
    crop_task: CropTask,
    image_store: ImageStore,
) -> tuple[ImageRef, bool]:
    """Execute one crop via Gemini code_execution (synchronous).

    Transient 503/429/UNAVAILABLE errors are retried with exponential
    backoff; any other exception is re-raised immediately.

    Parameters
    ----------
    client
        Shared Gemini client (safe to use from worker threads).
    page_image_bytes
        PNG bytes of the full page the crop is taken from.
    crop_task
        Task dict providing ``crop_instruction``, ``page_num`` and ``label``.
    image_store
        Store used to persist the resulting crop image.

    Returns
    -------
    (image_ref, is_fallback)
        ``is_fallback`` is True when Gemini failed to produce a crop and the
        full page image was returned instead. Fallbacks should NOT be cached.
    """
    prompt = CROPPER_PROMPT_TEMPLATE.format(
        crop_instruction=crop_task["crop_instruction"],
    )

    image_part = types.Part.from_bytes(data=page_image_bytes, mime_type="image/png")

    # Retry loop for transient API errors
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = client.models.generate_content(
                model=CROPPER_MODEL,
                contents=[image_part, prompt],
                config=types.GenerateContentConfig(
                    tools=[types.Tool(code_execution=types.ToolCodeExecution)]
                ),
            )
            break
        except Exception as e:
            err_str = str(e)
            if not ("503" in err_str or "429" in err_str or "UNAVAILABLE" in err_str):
                # Non-transient error: surface it to the caller.
                raise
            if attempt < MAX_RETRIES - 1:
                delay = RETRY_BASE_DELAY * (2 ** attempt)
                logger.warning(
                    "Crop API error (attempt %d/%d): %s — retrying in %.1fs",
                    attempt + 1, MAX_RETRIES, err_str[:120], delay,
                )
                time.sleep(delay)
            else:
                # Fix: don't sleep after the final attempt — there is no
                # retry coming, so the delay would only postpone the
                # full-page fallback below.
                logger.warning(
                    "Crop API error (final attempt %d/%d): %s — using fallback",
                    attempt + 1, MAX_RETRIES, err_str[:120],
                )

    is_fallback = True
    if response is not None:
        final_image = _extract_last_image(response)
        if final_image is not None:
            is_fallback = False
        else:
            # Model responded but produced no image — fall back to full page.
            final_image = Image.open(io.BytesIO(page_image_bytes))
    else:
        # All retries exhausted — fall back to full page.
        final_image = Image.open(io.BytesIO(page_image_bytes))

    crop_id = f"crop_{uuid.uuid4().hex[:6]}"
    ref = image_store.save_crop(
        page_num=crop_task["page_num"],
        crop_id=crop_id,
        image=final_image,
        label=crop_task["label"],
    )
    return ref, is_fallback
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def execute_crops(
    state: DrawingReaderState,
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
) -> dict:
    """Execute all crop tasks concurrently, reusing cached crops when possible.

    Parameters
    ----------
    state
        Graph state; reads ``crop_tasks`` and ``page_image_dir``.
    image_store
        Store that persists crop images (used from worker threads).
    crop_cache
        Optional cache; hits skip the API call entirely, and fallback
        results are flagged so the cache can decline them.
    progress_callback
        Optional callback invoked on the **main thread** each time a crop
        completes (or is served from cache). Called with
        ``(image_ref, crop_task, source, completed_count, total_count)``
        where *source* is ``"cached"``, ``"completed"``, or ``"fallback"``.

    Returns
    -------
    dict
        Partial state update with ``image_refs`` (in original task order)
        and ``status_message``.
    """
    crop_tasks = state.get("crop_tasks", [])
    page_image_dir = state["page_image_dir"]

    if not crop_tasks:
        return {"status_message": ["No crop tasks to execute."]}

    total_count = len(crop_tasks)
    completed_count = 0

    # ----- Phase 1: Separate cache hits from tasks that need API calls -----
    # Fix: results are slotted by original task index so the returned list
    # keeps the planner's ordering even though API calls finish out of order
    # (previously appends in as_completed order made the output
    # nondeterministic).
    results_by_index: list[ImageRef | None] = [None] * total_count
    tasks_to_execute: list[tuple[int, CropTask]] = []  # (original_index, task)
    cache_hits = 0

    for i, ct in enumerate(crop_tasks):
        cached_ref = (
            crop_cache.lookup(ct["page_num"], ct["crop_instruction"])
            if crop_cache is not None
            else None
        )
        if cached_ref is None:
            # Not cached — needs an API call
            tasks_to_execute.append((i, ct))
            continue

        results_by_index[i] = cached_ref
        cache_hits += 1
        completed_count += 1
        logger.info(
            "Reusing cached crop for '%s' (page %d)",
            ct["label"], ct["page_num"],
        )
        # Notify the UI immediately for each cache hit
        if progress_callback is not None:
            progress_callback(
                cached_ref, ct, "cached", completed_count, total_count,
            )

    # ----- Phase 2: Execute uncached crops via Gemini -----
    errors: list[str] = []

    if tasks_to_execute:
        client = genai.Client(api_key=GOOGLE_API_KEY)

        with ThreadPoolExecutor(max_workers=min(len(tasks_to_execute), 4)) as pool:
            future_to_task: dict = {}
            for orig_idx, ct in tasks_to_execute:
                page_bytes = get_page_image_bytes(page_image_dir, ct["page_num"])
                future = pool.submit(
                    _execute_single_crop_sync, client, page_bytes, ct, image_store,
                )
                future_to_task[future] = (orig_idx, ct)

            # Process results as they arrive — this runs on the MAIN thread,
            # so we can safely invoke the Streamlit progress callback here.
            for future in as_completed(future_to_task):
                orig_idx, ct = future_to_task[future]
                try:
                    ref, is_fallback = future.result()
                    results_by_index[orig_idx] = ref
                    completed_count += 1

                    # Register in cache (only successful targeted crops)
                    if crop_cache is not None:
                        crop_cache.register(
                            page_num=ct["page_num"],
                            crop_instruction=ct["crop_instruction"],
                            label=ct["label"],
                            image_ref=ref,
                            is_fallback=is_fallback,
                        )

                    # Notify the UI as each crop completes
                    if progress_callback is not None:
                        source = "fallback" if is_fallback else "completed"
                        progress_callback(
                            ref, ct, source, completed_count, total_count,
                        )

                except Exception as e:
                    completed_count += 1
                    errors.append(f"Crop task {orig_idx} failed: {e}")
                    logger.error("Crop task %d failed: %s", orig_idx, e)

    # Drop slots for failed tasks while preserving original ordering.
    image_refs: list[ImageRef] = [r for r in results_by_index if r is not None]

    # ----- Phase 3: Build status message -----
    api_count = len(tasks_to_execute) - len(errors)
    parts = [f"Completed {len(image_refs)} of {total_count} crops"]
    if cache_hits:
        parts.append(f"({cache_hits} from cache, {api_count} new)")
    if errors:
        parts.append(f"Errors: {'; '.join(errors)}")
    status = ". ".join(parts) + "."

    if crop_cache is not None:
        logger.info(crop_cache.stats)

    return {
        "image_refs": image_refs,
        "status_message": [status],
    }
|
nodes/deliberation.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""deliberation node — GPT peer review of Gemini's compliance analysis."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
from openai import OpenAI
|
| 7 |
+
|
| 8 |
+
from config import DELIBERATION_MODEL, OPENAI_API_KEY
|
| 9 |
+
from prompts.deliberation import DELIBERATION_SYSTEM_PROMPT
|
| 10 |
+
from state import AgentMessage, ComplianceState
|
| 11 |
+
from tools.image_store import ImageStore
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def deliberation(state: ComplianceState, image_store: ImageStore) -> dict:
    """Send compliance analysis + images + code report to GPT for peer review."""
    question = state["question"]
    compliance_analysis = state.get("compliance_analysis", "")
    code_report = state.get("code_report", "")
    image_refs = state.get("image_refs", [])

    # Guard: nothing to review — record the no-op and return immediately.
    if not compliance_analysis:
        empty_review = AgentMessage(
            timestamp=datetime.now().strftime("%H:%M:%S"),
            agent="reviewer",
            action="review",
            summary="No analysis to review.",
            detail="",
            evidence_refs=[],
        )
        return {
            "reviewer_analysis": "",
            "discussion_log": [empty_review],
            "status_message": ["No analysis to review."],
        }

    client = OpenAI(api_key=OPENAI_API_KEY)

    # Assemble the multimodal payload: question, legal context, and the
    # analyst's findings first, then every crop image the analyst examined.
    user_content: list[dict] = []
    user_content.append({"type": "text", "text": f"USER COMPLIANCE QUESTION: {question}"})
    user_content.append({"type": "text", "text": f"\n=== LEGAL REQUIREMENTS ===\n{code_report}"})
    user_content.append(
        {"type": "text", "text": f"\n=== ANALYST'S COMPLIANCE FINDINGS ===\n{compliance_analysis}"}
    )
    user_content.append(
        {"type": "text", "text": "\nBELOW ARE THE SAME CROPPED IMAGES THE ANALYST EXAMINED:"}
    )

    for ref in image_refs:
        user_content.append({"type": "text", "text": f"\nImage: {ref['label']}"})
        try:
            encoded = image_store.to_openai_base64(ref)
        except Exception as e:
            # Image could not be loaded — degrade to a textual note so the
            # review can still proceed.
            user_content.append({"type": "text", "text": f"(Could not load image: {e})"})
        else:
            user_content.append(encoded)

    user_content.append(
        {"type": "text", "text": "\nPerform your peer review of the compliance determination."}
    )

    response = client.chat.completions.create(
        model=DELIBERATION_MODEL,
        messages=[
            {"role": "system", "content": DELIBERATION_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )

    review_text = response.choices[0].message.content or ""

    review_msg = AgentMessage(
        timestamp=datetime.now().strftime("%H:%M:%S"),
        agent="reviewer",
        action="review",
        summary=f"Peer review complete. {review_text[:100]}...",
        detail=review_text[:1500],
        evidence_refs=[],
    )

    return {
        "reviewer_analysis": review_text,
        "discussion_log": [review_msg],
        "status_message": ["Deliberation/peer review complete."],
    }
|
nodes/final_verdict.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""final_verdict node — synthesize compliance analysis + optional peer review into structured verdict."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
from google import genai
|
| 7 |
+
from google.genai import types
|
| 8 |
+
|
| 9 |
+
from config import GOOGLE_API_KEY, VERDICT_MODEL
|
| 10 |
+
from state import AgentMessage, ComplianceState
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def final_verdict(state: ComplianceState) -> dict:
    """Produce the final compliance verdict, synthesizing all evidence.

    When consensus mode is disabled (or no peer review exists), the
    analyst's determination passes through unchanged. Otherwise Gemini
    synthesizes the Gemini analyst's findings and the GPT peer review into
    a single structured verdict.

    Returns a partial state update with ``final_verdict``,
    ``discussion_log`` and ``status_message``.
    """
    question = state["question"]
    compliance_analysis = state.get("compliance_analysis", "")
    reviewer_analysis = state.get("reviewer_analysis", "")
    code_report = state.get("code_report", "")
    enable_consensus = state.get("enable_consensus", False)

    # If no consensus was run, pass through the analyst's determination
    if not enable_consensus or not reviewer_analysis:
        verdict_msg = AgentMessage(
            timestamp=datetime.now().strftime("%H:%M:%S"),
            agent="compliance_analyst",
            action="verdict",
            summary="Final compliance verdict issued.",
            detail=compliance_analysis[:1000],
            evidence_refs=[],
        )
        return {
            "final_verdict": compliance_analysis,
            "discussion_log": [verdict_msg],
            "status_message": ["Final verdict ready."],
        }

    # Synthesize both perspectives
    client = genai.Client(api_key=GOOGLE_API_KEY)

    synthesis_prompt = f"""\
You are producing a FINAL COMPLIANCE VERDICT for a NYC building code review.

USER QUESTION: {question}

CODE REQUIREMENTS (from legal research):
{code_report[:3000]}

ANALYST A (Gemini) compliance findings:
{compliance_analysis}

ANALYST B (GPT) peer review:
{reviewer_analysis}

YOUR TASK:
1. If both analysts AGREE on compliance status: produce a confident, unified verdict.
2. If they PARTIALLY AGREE: produce the verdict based on agreed points, and explicitly \
note areas of disagreement with evidence from both sides.
3. If they DISAGREE: present both interpretations, explain the discrepancy, and state \
which determination appears better supported by the evidence.

OUTPUT FORMAT:

### Compliance Verdict
**Status:** Compliant | Non-Compliant | Partially Compliant | Unverifiable

### Legal Basis
For each code requirement checked:
- **[Code Type] SS[Section] — [Title]**
- Requirement: [specific measurable requirement]
- Drawing Evidence: [what was observed]
- Determination: [compliant/non-compliant/unverifiable]

### Key Findings
- Bullet points of the most important compliance determinations

### Analyst Consensus
- Agreement/disagreement between Gemini and GPT analysts
- Resolution of any conflicts

### Limitations
- What could not be verified and why
- Recommended follow-up actions

Always cite BOTH code sections AND image crop labels for every factual claim.
"""

    response = client.models.generate_content(
        model=VERDICT_MODEL,
        contents=[synthesis_prompt],
    )

    # Fix: response.text can be None when the model returns no text part;
    # guard with "" (same pattern deliberation uses) so the slices below
    # cannot raise TypeError.
    verdict_text = response.text or ""

    verdict_msg = AgentMessage(
        timestamp=datetime.now().strftime("%H:%M:%S"),
        agent="compliance_analyst",
        action="verdict",
        summary="Final synthesized compliance verdict issued.",
        detail=verdict_text[:1000],
        evidence_refs=[],
    )

    return {
        "final_verdict": verdict_text,
        "discussion_log": [verdict_msg],
        "status_message": ["Final synthesized verdict ready."],
    }
|
nodes/metadata_generator.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Background page metadata generator — extracts per-page descriptions from the full PDF.
|
| 2 |
+
|
| 3 |
+
Uses parallel batch processing: the PDF is split into 5-page chunks and each
|
| 4 |
+
chunk is sent to Gemini concurrently for faster metadata extraction.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import math
|
| 11 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 12 |
+
|
| 13 |
+
from google import genai
|
| 14 |
+
from google.genai import types
|
| 15 |
+
|
| 16 |
+
from config import GOOGLE_API_KEY, METADATA_MODEL
|
| 17 |
+
from prompts.metadata import METADATA_SYSTEM_PROMPT
|
| 18 |
+
from tools.pdf_processor import extract_page_range_bytes
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
# Number of PDF pages per batch sent to Gemini in parallel.
|
| 23 |
+
BATCH_SIZE = 5
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
# JSON extraction helper
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
def _extract_json_array(response_text: str) -> list[dict]:
|
| 31 |
+
"""Extract the outermost balanced JSON array from a response string."""
|
| 32 |
+
start = response_text.find("[")
|
| 33 |
+
if start == -1:
|
| 34 |
+
raise ValueError("No JSON array found in metadata generation response")
|
| 35 |
+
|
| 36 |
+
depth = 0
|
| 37 |
+
end = None
|
| 38 |
+
for i in range(start, len(response_text)):
|
| 39 |
+
if response_text[i] == "[":
|
| 40 |
+
depth += 1
|
| 41 |
+
elif response_text[i] == "]":
|
| 42 |
+
depth -= 1
|
| 43 |
+
if depth == 0:
|
| 44 |
+
end = i
|
| 45 |
+
break
|
| 46 |
+
|
| 47 |
+
if end is None:
|
| 48 |
+
raise ValueError("No matching closing bracket found in metadata response")
|
| 49 |
+
|
| 50 |
+
result = json.loads(response_text[start : end + 1])
|
| 51 |
+
if not isinstance(result, list):
|
| 52 |
+
raise ValueError(f"Expected list, got {type(result)}")
|
| 53 |
+
return result
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
# Single-batch API call
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
|
| 60 |
+
def _generate_batch(
    pdf_path: str,
    page_start_0: int,
    page_end_0: int,
    page_start_1: int,
    page_end_1: int,
) -> list[dict]:
    """Generate metadata for one contiguous range of pages via Gemini.

    Args:
        pdf_path: Path to the full PDF on disk.
        page_start_0: First page (0-indexed, inclusive) for PDF extraction.
        page_end_0: Last page (0-indexed, inclusive) for PDF extraction.
        page_start_1: First page (1-indexed) — used in the prompt text.
        page_end_1: Last page (1-indexed) — used in the prompt text.

    Returns:
        List of metadata dicts for the pages in this batch.
    """
    client = genai.Client(api_key=GOOGLE_API_KEY)

    excerpt_bytes = extract_page_range_bytes(pdf_path, page_start_0, page_end_0)
    pdf_part = types.Part.from_bytes(data=excerpt_bytes, mime_type="application/pdf")

    page_count = page_end_1 - page_start_1 + 1
    # The prompt pins the 1-indexed page numbering so the model's output
    # lines up with the rest of the pipeline.
    instruction_part = types.Part.from_text(
        text=(
            f"This PDF excerpt contains {page_count} page(s), "
            f"corresponding to pages {page_start_1} through {page_end_1} of the full drawing set.\n"
            f"Generate structured metadata for ALL {page_count} page(s). "
            f"Use page numbers {page_start_1} through {page_end_1} (1-indexed). "
            f"Return a JSON array with exactly {page_count} objects."
        )
    )

    response = client.models.generate_content(
        model=METADATA_MODEL,
        contents=[types.Content(role="user", parts=[pdf_part, instruction_part])],
        config=types.GenerateContentConfig(system_instruction=METADATA_SYSTEM_PROMPT),
    )

    return _extract_json_array(response.text.strip())
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
# Public entry point
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
|
| 109 |
+
def generate_page_metadata(
    pdf_path: str,
    num_pages: int,
    progress_callback=None,
) -> list[dict]:
    """Extract per-page structured metadata from a PDF using parallel batches.

    The PDF is split into chunks of ``BATCH_SIZE`` pages. Each chunk is sent
    to Gemini concurrently via a thread pool. Results are merged, any missing
    pages are back-filled with minimal placeholder entries, and the list is
    returned sorted by page number.

    Args:
        pdf_path: Path to the full PDF.
        num_pages: Total number of pages.
        progress_callback: Optional ``(completed_batches, total_batches, page_range_str) -> None``
            called after each batch finishes.

    Returns:
        A list of dicts (1-indexed ``page_num``), one per page. Empty for a
        zero-page PDF.

    Raises:
        RuntimeError: If every batch failed (partial failures only warn).
    """
    # Fix: guard the empty-PDF case — ThreadPoolExecutor(max_workers=0)
    # raises ValueError.
    if num_pages <= 0:
        return []

    num_batches = math.ceil(num_pages / BATCH_SIZE)
    # Fix: cap the pool size — one worker per batch meant a large drawing
    # set spawned dozens of threads (and concurrent API calls) at once.
    max_workers = min(num_batches, 8)
    logger.info(
        "Starting parallel metadata generation: %d pages in %d batches of %d (%d workers)",
        num_pages, num_batches, BATCH_SIZE, max_workers,
    )

    all_results: list[dict] = []
    errors: list[str] = []
    completed_count = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for batch_idx in range(num_batches):
            page_start_0 = batch_idx * BATCH_SIZE
            page_end_0 = min(page_start_0 + BATCH_SIZE - 1, num_pages - 1)
            page_start_1 = page_start_0 + 1
            page_end_1 = page_end_0 + 1

            future = executor.submit(
                _generate_batch,
                pdf_path,
                page_start_0,
                page_end_0,
                page_start_1,
                page_end_1,
            )
            futures[future] = (page_start_1, page_end_1)

        for future in as_completed(futures):
            batch_range = futures[future]
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                completed_count += 1
                logger.info(
                    "Batch pages %d-%d complete: %d entries",
                    batch_range[0], batch_range[1], len(batch_results),
                )
                if progress_callback is not None:
                    progress_callback(
                        completed_count,
                        num_batches,
                        f"Pages {batch_range[0]}-{batch_range[1]}",
                    )
            except Exception:
                completed_count += 1
                errors.append(
                    f"Batch pages {batch_range[0]}-{batch_range[1]} failed"
                )
                logger.exception(
                    "Batch pages %d-%d failed", batch_range[0], batch_range[1]
                )
                if progress_callback is not None:
                    progress_callback(
                        completed_count,
                        num_batches,
                        f"Pages {batch_range[0]}-{batch_range[1]} (failed)",
                    )

    if errors and not all_results:
        raise RuntimeError("All metadata batches failed:\n" + "\n".join(errors))

    if errors:
        logger.warning("Some batches failed (results will have gaps): %s", errors)

    # Metadata stays 1-indexed (as the model produced it) because it will be
    # passed as context text to the planner model, which also uses 1-indexed.
    # The planner's *output* is converted to 0-indexed in nodes/planner.py.

    # Fill in any missing pages with minimal entries (1-indexed)
    covered_pages = {item.get("page_num") for item in all_results}
    for p in range(1, num_pages + 1):
        if p not in covered_pages:
            all_results.append({
                "page_num": p,
                "sheet_id": "unknown",
                "sheet_title": "Unknown",
                "discipline": "other",
                "page_type": "other",
                "description": "Metadata not extracted for this page.",
                "key_elements": [],
                "spatial_coverage": "",
            })

    # Sort by page number
    all_results.sort(key=lambda x: x.get("page_num", 0))

    return all_results
|
prompts/__init__.py
ADDED
|
File without changes
|
prompts/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (232 Bytes). View file
|
|
|
prompts/__pycache__/annotator.cpython-313.pyc
ADDED
|
Binary file (1.1 kB). View file
|
|
|
prompts/__pycache__/code_lookup.cpython-313.pyc
ADDED
|
Binary file (1.95 kB). View file
|
|
|
prompts/__pycache__/compliance_analyst.cpython-313.pyc
ADDED
|
Binary file (4.29 kB). View file
|
|
|
prompts/__pycache__/compliance_planner.cpython-313.pyc
ADDED
|
Binary file (3.2 kB). View file
|
|
|
prompts/__pycache__/cropper.cpython-313.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
prompts/__pycache__/deliberation.cpython-313.pyc
ADDED
|
Binary file (1.71 kB). View file
|
|
|