Ryan2219 committed on
Commit
e1ced8e
·
verified ·
1 Parent(s): f6b94ac

Upload 70 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. NorthMaconPark.pdf +3 -0
  3. __init__.py +0 -0
  4. app.py +628 -0
  5. config.py +40 -0
  6. data/BUILDING_CODE.json +0 -0
  7. data/FUEL_GAS_CODE.json +0 -0
  8. data/GENERAL_ADMINISTRATIVE_PROVISIONS.json +3 -0
  9. data/MECHANICAL_CODE.json +0 -0
  10. data/PLUMBING_CODE.json +0 -0
  11. data/ingest_chromadb.py +120 -0
  12. data/nyc_code_db/chroma.sqlite3 +3 -0
  13. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/data_level0.bin +3 -0
  14. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/header.bin +3 -0
  15. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/index_metadata.pickle +3 -0
  16. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/length.bin +3 -0
  17. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/link_lists.bin +3 -0
  18. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/data_level0.bin +3 -0
  19. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/header.bin +3 -0
  20. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/index_metadata.pickle +3 -0
  21. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/length.bin +3 -0
  22. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/link_lists.bin +3 -0
  23. data/preprocess_codes.py +253 -0
  24. graph.py +147 -0
  25. nodes/__init__.py +0 -0
  26. nodes/__pycache__/__init__.cpython-313.pyc +0 -0
  27. nodes/__pycache__/annotator.cpython-313.pyc +0 -0
  28. nodes/__pycache__/code_lookup.cpython-313.pyc +0 -0
  29. nodes/__pycache__/compliance_analyst.cpython-313.pyc +0 -0
  30. nodes/__pycache__/compliance_planner.cpython-313.pyc +0 -0
  31. nodes/__pycache__/cropper.cpython-313.pyc +0 -0
  32. nodes/__pycache__/deliberation.cpython-313.pyc +0 -0
  33. nodes/__pycache__/final_verdict.cpython-313.pyc +0 -0
  34. nodes/__pycache__/metadata_generator.cpython-313.pyc +0 -0
  35. nodes/annotator.py +117 -0
  36. nodes/code_lookup.py +286 -0
  37. nodes/compliance_analyst.py +188 -0
  38. nodes/compliance_planner.py +130 -0
  39. nodes/cropper.py +234 -0
  40. nodes/deliberation.py +85 -0
  41. nodes/final_verdict.py +107 -0
  42. nodes/metadata_generator.py +211 -0
  43. prompts/__init__.py +0 -0
  44. prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  45. prompts/__pycache__/annotator.cpython-313.pyc +0 -0
  46. prompts/__pycache__/code_lookup.cpython-313.pyc +0 -0
  47. prompts/__pycache__/compliance_analyst.cpython-313.pyc +0 -0
  48. prompts/__pycache__/compliance_planner.cpython-313.pyc +0 -0
  49. prompts/__pycache__/cropper.cpython-313.pyc +0 -0
  50. prompts/__pycache__/deliberation.cpython-313.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/GENERAL_ADMINISTRATIVE_PROVISIONS.json filter=lfs diff=lfs merge=lfs -text
37
+ data/nyc_code_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
38
+ NorthMaconPark.pdf filter=lfs diff=lfs merge=lfs -text
NorthMaconPark.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aed76b73fbe205e1579e3a00be6e95b7564e72594b1fdb83311819d447f8fc4
3
+ size 39114794
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit UI for NYC Code Compliance Bot — with agent discussion panel."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import logging
6
+ import tempfile
7
+ from pathlib import Path
8
+ import os
9
+ import sys
10
+ import streamlit as st
11
+ from PIL import Image
12
+
13
+ from config import MAX_INVESTIGATION_ROUNDS
14
+ from graph import compile_compliance_graph
15
+ from tools.chroma_tools import warmup_collection, is_warmed_up
16
+ from tools.crop_cache import CropCache
17
+ from tools.image_store import ImageStore
18
+ from tools.metadata_cache import MetadataState, get_cached_metadata
19
+ from tools.pdf_processor import render_pages
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Page config
25
+ # ---------------------------------------------------------------------------
26
+ st.set_page_config(
27
+ page_title="NYC Code Compliance Bot",
28
+ page_icon=":building_construction:",
29
+ layout="wide",
30
+ )
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Custom CSS for agent discussion panel
34
+ # ---------------------------------------------------------------------------
35
+ st.markdown("""
36
+ <style>
37
+ .agent-msg {
38
+ padding: 8px 12px;
39
+ margin: 4px 0;
40
+ border-radius: 8px;
41
+ font-size: 0.9em;
42
+ }
43
+ .agent-planner {
44
+ background-color: #e3f2fd;
45
+ border-left: 4px solid #1565c0;
46
+ }
47
+ .agent-code_analyst {
48
+ background-color: #fff3e0;
49
+ border-left: 4px solid #e65100;
50
+ }
51
+ .agent-compliance_analyst {
52
+ background-color: #e8f5e9;
53
+ border-left: 4px solid #2e7d32;
54
+ }
55
+ .agent-reviewer {
56
+ background-color: #f3e5f5;
57
+ border-left: 4px solid #6a1b9a;
58
+ }
59
+ .agent-icon {
60
+ font-weight: bold;
61
+ margin-right: 6px;
62
+ }
63
+ .agent-timestamp {
64
+ color: #666;
65
+ font-size: 0.8em;
66
+ }
67
+ </style>
68
+ """, unsafe_allow_html=True)
69
+
70
+ AGENT_ICONS = {
71
+ "planner": "\U0001f4cb",
72
+ "code_analyst": "\u2696\ufe0f",
73
+ "compliance_analyst": "\U0001f50d",
74
+ "reviewer": "\U0001f91d",
75
+ }
76
+
77
+ AGENT_LABELS = {
78
+ "planner": "Planner",
79
+ "code_analyst": "Code Analyst",
80
+ "compliance_analyst": "Compliance Analyst",
81
+ "reviewer": "Reviewer",
82
+ }
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Session state defaults
86
+ # ---------------------------------------------------------------------------
87
+ if "pdf_loaded" not in st.session_state:
88
+ st.session_state.pdf_loaded = False
89
+ if "chat_history" not in st.session_state:
90
+ st.session_state.chat_history = []
91
+ if "image_store" not in st.session_state:
92
+ st.session_state.image_store = None
93
+ if "ingest_state" not in st.session_state:
94
+ st.session_state.ingest_state = {}
95
+ if "pdf_bytes" not in st.session_state:
96
+ st.session_state.pdf_bytes = None
97
+ if "metadata_state" not in st.session_state:
98
+ st.session_state.metadata_state = MetadataState()
99
+ if "crop_cache" not in st.session_state:
100
+ st.session_state.crop_cache = CropCache()
101
+ if "discussion_log" not in st.session_state:
102
+ st.session_state.discussion_log = []
103
+ if "code_report" not in st.session_state:
104
+ st.session_state.code_report = ""
105
+ if "code_sections" not in st.session_state:
106
+ st.session_state.code_sections = []
107
+ if "image_refs" not in st.session_state:
108
+ st.session_state.image_refs = []
109
+ if "db_ready" not in st.session_state:
110
+ st.session_state.db_ready = False
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Startup: warm up embedding model + ChromaDB
114
+ # ---------------------------------------------------------------------------
115
# ---------------------------------------------------------------------------
# Startup: warm up embedding model + ChromaDB
# ---------------------------------------------------------------------------
# Runs once per session. On failure the app keeps working, but db_ready stays
# False so code lookup is disabled downstream.
if not st.session_state.db_ready:
    with st.status("Loading NYC Code Database...", expanded=True) as _db_status:
        st.write(":brain: Loading embedding model (bge-large-en-v1.5)...")
        st.write("_This is a one-time download (~1.3 GB) on first run._")
        if warmup_collection():
            st.session_state.db_ready = True
            _db_status.update(label="NYC Code Database ready", state="complete")
        else:
            st.session_state.db_ready = False
            _db_status.update(
                label="NYC Code Database not available — code lookup will be disabled",
                state="error",
            )
    # Rerun only on success so the status widget is cleared from the page.
    if st.session_state.db_ready:
        st.rerun()
131
+
132
+ # ---------------------------------------------------------------------------
133
+ # Sidebar
134
+ # ---------------------------------------------------------------------------
135
+ with st.sidebar:
136
+ st.title(":building_construction: NYC Code Compliance Bot")
137
+ st.markdown(
138
+ "Upload a construction drawing PDF and ask compliance questions. "
139
+ "The system uses **agentic vision** + **NYC code database** to "
140
+ "verify code compliance."
141
+ )
142
+
143
+ st.divider()
144
+
145
+ # PDF upload
146
+ uploaded_file = st.file_uploader("Upload Drawing PDF", type=["pdf"])
147
+
148
+ # Default drawing button
149
+ _DEFAULT_PDF = Path(__file__).parent / "NorthMaconPark.pdf"
150
+ if _DEFAULT_PDF.exists() and not st.session_state.pdf_loaded:
151
+ st.markdown("**— or —**")
152
+ if st.button("Use Default Drawing", use_container_width=True):
153
+ st.session_state._use_default_pdf = True
154
+ st.rerun()
155
+
156
+ st.divider()
157
+
158
+ # Settings
159
+ st.subheader("Settings")
160
+ enable_consensus = st.checkbox(
161
+ "Enable peer review (Gemini + GPT)",
162
+ value=False,
163
+ help="GPT reviews Gemini's compliance analysis. Slower but more thorough.",
164
+ )
165
+ enable_annotation = st.checkbox(
166
+ "Enable annotation",
167
+ value=False,
168
+ help="Annotate crops with numbered highlights before analysis.",
169
+ )
170
+ max_rounds = st.slider(
171
+ "Max investigation rounds",
172
+ min_value=1,
173
+ max_value=5,
174
+ value=MAX_INVESTIGATION_ROUNDS,
175
+ help="Maximum crop-analyze loops before forcing a final verdict.",
176
+ )
177
+
178
+ st.divider()
179
+ st.caption("Powered by LangGraph + Gemini + GPT + ChromaDB")
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # PDF ingestion — Phase A: render pages
183
+ # ---------------------------------------------------------------------------
184
+ # Determine if we have a PDF to process (uploaded or default)
185
+ _pending_pdf: tuple[str, bytes] | None = None
186
+ if not st.session_state.pdf_loaded:
187
+ if uploaded_file is not None:
188
+ _pending_pdf = (uploaded_file.name, uploaded_file.getvalue())
189
+ elif st.session_state.get("_use_default_pdf"):
190
+ _default = Path(__file__).parent / "NorthMaconPark.pdf"
191
+ if _default.exists():
192
+ _pending_pdf = (_default.name, _default.read_bytes())
193
+
194
+ if _pending_pdf is not None and not st.session_state.pdf_loaded:
195
+ pdf_name, pdf_bytes = _pending_pdf
196
+ with st.status("Converting PDF to images...", expanded=True) as status:
197
+ tmp_dir = tempfile.mkdtemp(prefix="compliance_bot_")
198
+ pdf_path = Path(tmp_dir) / pdf_name
199
+ pdf_path.write_bytes(pdf_bytes)
200
+
201
+ st.session_state.pdf_bytes = pdf_bytes
202
+ st.session_state.crop_cache = CropCache()
203
+
204
+ image_store = ImageStore(str(Path(tmp_dir) / "images"))
205
+ st.session_state.image_store = image_store
206
+ page_image_dir = str(image_store._pages_dir)
207
+
208
+ # Check for cached metadata
209
+ cached = get_cached_metadata(pdf_bytes)
210
+ if cached is not None:
211
+ st.session_state.metadata_state.set_ready(json.dumps(cached, indent=2))
212
+ st.write("Page index loaded from cache")
213
+
214
+ st.write("Rendering pages...")
215
+ num_pages = render_pages(str(pdf_path), page_image_dir)
216
+
217
+ st.session_state.ingest_state = {
218
+ "pdf_path": str(pdf_path),
219
+ "page_image_dir": page_image_dir,
220
+ "num_pages": num_pages,
221
+ }
222
+ st.session_state.pdf_loaded = True
223
+ st.session_state.pop("_use_default_pdf", None)
224
+ st.write(f"Converted {num_pages} pages to images.")
225
+ status.update(label=f"PDF ready: {num_pages} pages", state="complete")
226
+ st.rerun()
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # PDF ingestion — Phase B: generate page index
230
+ # ---------------------------------------------------------------------------
231
+ if st.session_state.pdf_loaded:
232
+ meta = st.session_state.metadata_state
233
+ if meta.status == "not_started":
234
+ if st.session_state.pdf_bytes is not None:
235
+ with st.expander(":page_facing_up: PDF Viewer", expanded=False):
236
+ st.pdf(st.session_state.pdf_bytes, height=400)
237
+
238
+ ingest = st.session_state.ingest_state
239
+ num_pages = ingest["num_pages"]
240
+
241
+ st.write("**Generating page index...**")
242
+ progress_bar = st.progress(0, text="Analyzing pages to build searchable index...")
243
+
244
+ def _index_progress(completed: int, total: int, label: str):
245
+ pct = completed / total
246
+ progress_bar.progress(pct, text=f"Indexing: {label} ({completed}/{total} batches)")
247
+
248
+ meta.generate_sync(
249
+ ingest["pdf_path"],
250
+ num_pages,
251
+ st.session_state.pdf_bytes,
252
+ progress_callback=_index_progress,
253
+ )
254
+ if meta.is_ready:
255
+ progress_bar.progress(1.0, text="Page index ready!")
256
+ else:
257
+ progress_bar.progress(1.0, text="Indexing failed — using full PDF mode")
258
+ st.rerun()
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # Main layout (pre-upload welcome)
262
+ # ---------------------------------------------------------------------------
263
+ if not st.session_state.pdf_loaded:
264
+ _left, center, _right = st.columns([1, 2, 1])
265
+ with center:
266
+ st.markdown(
267
+ "<h1 style='text-align: center;'>:building_construction: NYC Code Compliance Bot</h1>",
268
+ unsafe_allow_html=True,
269
+ )
270
+ st.markdown(
271
+ "<p style='text-align: center; color: grey;'>"
272
+ "Upload a construction drawing PDF in the sidebar to get started.<br>"
273
+ "This tool uses <b>agentic vision</b> and the <b>NYC Building Code database</b> "
274
+ "to verify code compliance in your drawings."
275
+ "</p>",
276
+ unsafe_allow_html=True,
277
+ )
278
+ st.stop()
279
+
280
+ # ---------------------------------------------------------------------------
281
+ # PDF viewer
282
+ # ---------------------------------------------------------------------------
283
+ if st.session_state.pdf_bytes is not None:
284
+ with st.expander(":page_facing_up: PDF Viewer", expanded=False):
285
+ st.pdf(st.session_state.pdf_bytes, height=400)
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Three-column layout: chat | discussion | images+code
289
+ # ---------------------------------------------------------------------------
290
+ chat_col, discuss_col, evidence_col = st.columns([2, 2, 2])
291
+
292
+ # ---------------------------------------------------------------------------
293
+ # Discussion panel (agent conversation)
294
+ # ---------------------------------------------------------------------------
295
def render_discussion_log(container, discussion_log: list[dict]):
    """Render each agent message of *discussion_log* into *container*.

    Messages are styled via the ``.agent-<name>`` CSS classes injected at
    page top; an unrecognised agent falls back to a generic robot icon and
    its raw id as the label.
    """
    with container:
        for entry in discussion_log:
            agent_id = entry.get("agent", "unknown")
            icon = AGENT_ICONS.get(agent_id, "\U0001f916")
            label = AGENT_LABELS.get(agent_id, agent_id)
            timestamp = entry.get("timestamp", "")
            summary = entry.get("summary", "")
            # Assemble the HTML fragment first, then emit it in one call.
            html = (
                f'<div class="agent-msg agent-{agent_id}">'
                f'<span class="agent-timestamp">[{timestamp}]</span> '
                f'<span class="agent-icon">{icon} {label}</span><br>'
                f'{summary}'
                f'</div>'
            )
            st.markdown(html, unsafe_allow_html=True)
312
+
313
+ # ---------------------------------------------------------------------------
314
+ # Chat history display
315
+ # ---------------------------------------------------------------------------
316
+ with chat_col:
317
+ st.subheader(":speech_balloon: Chat")
318
+
319
+ meta = st.session_state.metadata_state
320
+ if meta.is_ready:
321
+ st.caption("Page index ready — fast planning enabled")
322
+ elif meta.status == "failed":
323
+ st.caption("Page indexing failed — using full PDF mode")
324
+
325
+ for role, content, _refs in st.session_state.chat_history:
326
+ with st.chat_message(role):
327
+ st.markdown(content)
328
+
329
+ question = st.chat_input("Ask a compliance question about the drawing...")
330
+
331
+ # ---------------------------------------------------------------------------
332
+ # Discussion panel
333
+ # ---------------------------------------------------------------------------
334
+ with discuss_col:
335
+ st.subheader(":busts_in_silhouette: Agent Discussion")
336
+ discussion_container = st.container()
337
+
338
+ if st.session_state.discussion_log:
339
+ render_discussion_log(discussion_container, st.session_state.discussion_log)
340
+ else:
341
+ st.info("Agent discussions will appear here during analysis.")
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # Evidence panel (images + code)
345
+ # ---------------------------------------------------------------------------
346
+ with evidence_col:
347
+ st.subheader(":framed_picture: Evidence")
348
+
349
+ evidence_tabs = st.tabs(["Drawing Crops", "Code Sections"])
350
+
351
+ with evidence_tabs[0]:
352
+ if st.session_state.image_refs:
353
+ for ref in st.session_state.image_refs:
354
+ try:
355
+ img = Image.open(ref["path"])
356
+ st.image(img, caption=ref["label"], use_container_width=True)
357
+ except Exception:
358
+ st.warning(f"Could not load: {ref['label']}")
359
+ elif st.session_state.chat_history:
360
+ st.info("No images for this question.")
361
+ else:
362
+ st.info("Ask a question to see drawing crops here.")
363
+
364
+ with evidence_tabs[1]:
365
+ if st.session_state.code_sections:
366
+ for sec in st.session_state.code_sections:
367
+ with st.expander(
368
+ f":balance_scale: {sec.get('code_type', '?')} §{sec.get('section_full', '?')}",
369
+ expanded=False,
370
+ ):
371
+ if sec.get("relevance"):
372
+ st.caption(sec["relevance"])
373
+ st.markdown(sec.get("text", "")[:1500])
374
+ if st.session_state.code_report:
375
+ with st.expander(":page_facing_up: Full Code Report", expanded=False):
376
+ st.markdown(st.session_state.code_report[:5000])
377
+ else:
378
+ st.info("Code sections retrieved during analysis will appear here.")
379
+
380
+
381
+ # ---------------------------------------------------------------------------
382
+ # Question processing
383
+ # ---------------------------------------------------------------------------
384
+ if question:
385
+ # Add user message to history
386
+ st.session_state.chat_history.append(("user", question, []))
387
+ st.session_state.discussion_log = [] # Reset discussion for new question
388
+ st.session_state.code_report = "" # Reset code report for new question
389
+ st.session_state.code_sections = [] # Reset code sections for new question
390
+ st.session_state.image_refs = [] # Reset image refs for new question
391
+
392
+ with chat_col:
393
+ with st.chat_message("user"):
394
+ st.markdown(question)
395
+
396
+ # Build initial state
397
+ ingest = st.session_state.ingest_state
398
+ image_store = st.session_state.image_store
399
+
400
+ meta = st.session_state.metadata_state
401
+ metadata_json = meta.data_json if meta.is_ready else ""
402
+
403
+ question_state = {
404
+ "messages": [],
405
+ "question": question,
406
+ "pdf_path": ingest.get("pdf_path", ""),
407
+ "page_image_dir": ingest.get("page_image_dir", ""),
408
+ "num_pages": ingest.get("num_pages", 0),
409
+ "page_metadata_json": metadata_json,
410
+ "legend_pages": [],
411
+ "target_pages": [],
412
+ "crop_tasks": [],
413
+ "code_queries": [],
414
+ "image_refs": [],
415
+ "code_sections": [],
416
+ "code_report": "",
417
+ "code_chapters_fetched": [],
418
+ "compliance_analysis": "",
419
+ "reviewer_analysis": "",
420
+ "final_verdict": "",
421
+ "discussion_log": [],
422
+ "additional_crop_tasks": [],
423
+ "additional_code_queries": [],
424
+ "needs_more_investigation": False,
425
+ "investigation_round": 0,
426
+ "max_rounds": max_rounds,
427
+ "enable_consensus": enable_consensus,
428
+ "enable_annotation": enable_annotation,
429
+ "status_message": [],
430
+ }
431
+
432
+ # ------------------------------------------------------------------
433
+ # Live progress
434
+ # ------------------------------------------------------------------
435
+ crop_cache = st.session_state.crop_cache
436
+
437
+ with evidence_col:
438
+ with evidence_tabs[0]:
439
+ crop_counter_placeholder = st.empty()
440
+ crop_image_container = st.container()
441
+
442
+ def on_crop_progress(
443
+ completed_ref, crop_task, source: str, completed_count: int, total_count: int,
444
+ ) -> None:
445
+ source_tag = " (cached)" if source == "cached" else ""
446
+ crop_counter_placeholder.markdown(
447
+ f"**Crop {completed_count}/{total_count}**{source_tag} \n"
448
+ f"Latest: *{crop_task.get('label', 'Crop')}*"
449
+ )
450
+ with crop_image_container:
451
+ try:
452
+ img = Image.open(completed_ref["path"])
453
+ caption = completed_ref["label"]
454
+ if source == "cached":
455
+ caption += " (cached)"
456
+ st.image(img, caption=caption, use_container_width=True)
457
+ except Exception:
458
+ st.warning(f"Could not load: {completed_ref['label']}")
459
+
460
+ # Compile graph
461
+ compliance_graph = compile_compliance_graph(image_store, crop_cache, on_crop_progress)
462
+
463
+ # Node progress labels
464
+ PROGRESS_LABELS = {
465
+ "compliance_planner": "Planning investigation...",
466
+ "execute_crops": "Cropping drawing images...",
467
+ "annotate_crops": "Annotating crops...",
468
+ "initial_code_lookup": "Searching NYC code database...",
469
+ "compliance_analyst": "Analyzing compliance...",
470
+ "targeted_code_lookup": "Follow-up code search...",
471
+ "deliberation": "Running peer review...",
472
+ "final_verdict": "Synthesizing verdict...",
473
+ }
474
+
475
+ with chat_col:
476
+ with st.status("Investigating compliance...", expanded=True) as status:
477
+ all_image_refs: list[dict] = []
478
+ all_discussion: list[dict] = []
479
+ final_verdict_text = ""
480
+ code_report_text = ""
481
+
482
+ st.write(PROGRESS_LABELS["compliance_planner"])
483
+
484
+ # Placeholder for parallel-branch status (updated after planner completes)
485
+ parallel_status = st.empty()
486
+
487
+ for event in compliance_graph.stream(question_state, stream_mode="updates"):
488
+ node_name = list(event.keys())[0]
489
+ update = event[node_name]
490
+
491
+ # Status messages (list, since parallel nodes can both emit)
492
+ status_msgs = update.get("status_message", [])
493
+ for status_msg in status_msgs:
494
+ if status_msg:
495
+ st.write(f":white_check_mark: {status_msg}")
496
+
497
+ # Collect discussion messages
498
+ new_discussion = update.get("discussion_log", [])
499
+ if new_discussion:
500
+ all_discussion.extend(new_discussion)
501
+ st.session_state.discussion_log = all_discussion
502
+ # Re-render discussion panel
503
+ render_discussion_log(discussion_container, all_discussion)
504
+
505
+ # Node-specific handling
506
+ if node_name == "compliance_planner":
507
+ target_pages = update.get("target_pages", [])
508
+ crop_tasks = update.get("crop_tasks", [])
509
+ code_queries = update.get("code_queries", [])
510
+
511
+ with st.expander(":clipboard: Investigation Plan", expanded=True):
512
+ if target_pages:
513
+ st.markdown(f"**Target pages:** {', '.join(str(p + 1) for p in target_pages)}")
514
+ if crop_tasks:
515
+ st.markdown(f"**Image crops ({len(crop_tasks)}):**")
516
+ for i, task in enumerate(crop_tasks, 1):
517
+ display_page = task.get("page_num", 0) + 1
518
+ st.markdown(f" {i}. {task.get('label', 'Crop')} (p.{display_page})")
519
+ if code_queries:
520
+ st.markdown(f"**Code queries ({len(code_queries)}):**")
521
+ for i, q in enumerate(code_queries, 1):
522
+ st.markdown(f" {i}. [{q.get('focus_area', '?')}] {q.get('query', '')[:80]}...")
523
+
524
+ if crop_tasks:
525
+ crop_counter_placeholder.markdown(f"**Crop 0/{len(crop_tasks)}** — starting...")
526
+
527
+ # Show parallel execution message (this appears while both branches run)
528
+ parallel_status.info(
529
+ ":arrows_counterclockwise: Running in parallel: "
530
+ f"**Cropping {len(crop_tasks)} images** + "
531
+ f"**Searching {len(code_queries)} code queries**. "
532
+ "This may take 30-60 seconds..."
533
+ )
534
+
535
+ elif node_name in ("initial_code_lookup", "execute_crops"):
536
+ # Clear the parallel status once a branch finishes
537
+ parallel_status.empty()
538
+
539
+ if node_name in ("initial_code_lookup", "targeted_code_lookup"):
540
+ report = update.get("code_report", "")
541
+ new_sections = update.get("code_sections", [])
542
+ if report:
543
+ code_report_text = report
544
+ st.session_state.code_report = report
545
+ if new_sections:
546
+ st.session_state.code_sections.extend(new_sections)
547
+ # Render each new section in the evidence panel in real-time
548
+ with evidence_col:
549
+ with evidence_tabs[1]:
550
+ for sec in new_sections:
551
+ with st.expander(
552
+ f":balance_scale: {sec.get('code_type', '?')} "
553
+ f"§{sec.get('section_full', '?')}",
554
+ expanded=False,
555
+ ):
556
+ if sec.get("relevance"):
557
+ st.caption(sec["relevance"])
558
+ st.markdown(sec.get("text", "")[:1500])
559
+
560
+ elif node_name == "compliance_analyst":
561
+ analysis = update.get("compliance_analysis", "")
562
+ needs_more = update.get("needs_more_investigation", False)
563
+ round_num = update.get("investigation_round", 1)
564
+
565
+ if analysis:
566
+ label = f":mag: Compliance Analysis (Round {round_num})"
567
+ if needs_more:
568
+ label += " — requesting more evidence"
569
+ with st.expander(label, expanded=False):
570
+ st.markdown(analysis[:5000])
571
+
572
+ elif node_name == "deliberation":
573
+ review = update.get("reviewer_analysis", "")
574
+ if review:
575
+ with st.expander(":handshake: Peer Review", expanded=False):
576
+ st.markdown(review[:3000])
577
+
578
+ # Collect images — persist to session state and render in evidence panel
579
+ new_refs = update.get("image_refs", [])
580
+ if new_refs:
581
+ all_image_refs.extend(new_refs)
582
+ st.session_state.image_refs.extend(new_refs)
583
+ # Render each new crop in the evidence panel in real-time
584
+ with evidence_col:
585
+ with evidence_tabs[0]:
586
+ for ref in new_refs:
587
+ try:
588
+ img = Image.open(ref["path"])
589
+ st.image(img, caption=ref["label"], use_container_width=True)
590
+ except Exception:
591
+ st.warning(f"Could not load: {ref['label']}")
592
+
593
+ # Capture final verdict
594
+ if "final_verdict" in update and update["final_verdict"]:
595
+ final_verdict_text = update["final_verdict"]
596
+
597
+ # Show next step label
598
+ if node_name in PROGRESS_LABELS:
599
+ next_labels = {
600
+ "compliance_planner": ["execute_crops", "initial_code_lookup"],
601
+ "execute_crops": ["compliance_analyst"],
602
+ "annotate_crops": ["compliance_analyst"],
603
+ "initial_code_lookup": ["compliance_analyst"],
604
+ "compliance_analyst": ["final_verdict"],
605
+ "targeted_code_lookup": ["compliance_analyst"],
606
+ "deliberation": ["final_verdict"],
607
+ }
608
+ for next_node in next_labels.get(node_name, []):
609
+ if next_node in PROGRESS_LABELS:
610
+ st.write(PROGRESS_LABELS[next_node])
611
+
612
+ if crop_cache.size > 0:
613
+ st.caption(f":file_folder: {crop_cache.stats}")
614
+ status.update(label="Compliance investigation complete", state="complete")
615
+
616
+ # Display final verdict
617
+ if final_verdict_text:
618
+ with chat_col:
619
+ with st.chat_message("assistant"):
620
+ st.markdown(final_verdict_text)
621
+
622
+ st.session_state.chat_history[-1] = ("user", question, [])
623
+ st.session_state.chat_history.append(("assistant", final_verdict_text, all_image_refs))
624
+ else:
625
+ with chat_col:
626
+ st.error("No verdict was generated. Please try again.")
627
+
628
+ st.rerun()
config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Central configuration for the NYC Code Compliance Bot.

API keys are read from the environment (optionally via a ``.env`` file next
to this module), model names are pinned per pipeline stage, and the remaining
constants tune PDF processing and code-database lookups.
"""
import os
from dotenv import load_dotenv

# Explicit path so .env is found regardless of the working directory
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
load_dotenv(os.path.join(_THIS_DIR, ".env"))

# --- API Keys ---
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# --- Model Names (one model per pipeline stage) ---
PLANNER_MODEL = "gemini-3-flash-preview"
CROPPER_MODEL = "gemini-3-flash-preview"
ANNOTATOR_MODEL = "gemini-2.5-flash-image"
ANALYZER_MODEL = "gemini-3-flash-preview"
CODE_LOOKUP_MODEL = "gpt-5-mini"
DELIBERATION_MODEL = "gpt-5.2"
VERDICT_MODEL = "gemini-3-flash-preview"
METADATA_MODEL = "gemini-3-flash-preview"

# --- ChromaDB ---
# Reuse _THIS_DIR so the DB path is anchored to this file the same way the
# .env path is resolved above (previously recomputed os.path.dirname(__file__)).
CHROMA_DB_PATH = os.path.join(_THIS_DIR, "data", "nyc_code_db")
CHROMA_COLLECTION_NAME = "nyc_building_codes"
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# --- Processing Constants ---
PDF_RENDER_DPI = 100          # rasterisation resolution for PDF pages
MAX_INVESTIGATION_ROUNDS = 3  # default cap on crop/analyze loops
CROP_PADDING_PX = 40          # extra pixels kept around each requested crop

# --- Code Lookup Budgets (per individual query, queries run in parallel) ---
MAX_DISCOVER_CALLS = 1
MAX_FETCH_CALLS = 1
MAX_CODE_LOOKUP_TURNS = 5

# --- Search Defaults ---
DISCOVER_N_RESULTS = 50  # Retrieve more candidates for re-ranking
RERANK_TOP_K = 20  # Return top-K after re-ranking
FETCH_MAX_SECTIONS = 30  # Max sections per chapter fetch
data/BUILDING_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/FUEL_GAS_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/GENERAL_ADMINISTRATIVE_PROVISIONS.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2060b78a8e7c0061ee1dab1afe66a2a3e4cb28694af0495a8b3340a26c69940
3
+ size 20920937
data/MECHANICAL_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/PLUMBING_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/ingest_chromadb.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ingest preprocessed NYC code JSON files into ChromaDB with bge-large-en-v1.5."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ import chromadb
9
+ from chromadb.utils import embedding_functions
10
+
11
+
12
# Sentence-transformers model used to embed code sections; must match the
# EMBEDDING_MODEL_NAME the query side uses against the same collection.
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
# Collection is dropped and recreated on every full ingest (see create_collection).
COLLECTION_NAME = "nyc_building_codes"
# Persistent DB directory lives next to this script.
DB_PATH = os.path.join(os.path.dirname(__file__), "nyc_code_db")

# Map of JSON files to their code types
CODE_FILES = {
    "BUILDING_CODE.json": "Building",
    "FUEL_GAS_CODE.json": "FuelGas",
    "GENERAL_ADMINISTRATIVE_PROVISIONS.json": "Administrative",
    "MECHANICAL_CODE.json": "Mechanical",
    "PLUMBING_CODE.json": "Plumbing",
}
24
+
25
+
26
def create_collection(db_path: str = DB_PATH, reset: bool = True):
    """Create or reset the ChromaDB collection.

    Returns a (client, collection) pair so callers can keep the client alive.
    """
    client = chromadb.PersistentClient(path=db_path)
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL,
    )

    if reset:
        # Best-effort drop: the collection may not exist on a fresh database.
        try:
            client.delete_collection(name=COLLECTION_NAME)
        except Exception:
            pass
        else:
            print(f"Deleted existing collection '{COLLECTION_NAME}'.")

    collection = client.create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedder,
    )
    return client, collection
45
+
46
+
47
def ingest_json_file(collection, json_path: str, code_type: str) -> int:
    """Ingest a single JSON file into the collection. Returns count of sections added."""
    with open(json_path, "r", encoding="utf-8") as f:
        entries = json.load(f)

    def _flatten(meta: dict) -> dict:
        # ChromaDB metadata values must be str/int/float/bool, so lists are
        # joined into a comma-separated string and everything else stringified.
        flat: dict = {}
        for key, value in meta.items():
            if isinstance(value, list):
                flat[key] = ", ".join(str(item) for item in value) if value else ""
            elif isinstance(value, (bool, int, float)):
                flat[key] = value
            else:
                flat[key] = str(value)
        return flat

    documents: list = []
    metadatas: list = []
    ids: list = []
    seen: set[str] = set()

    for entry in entries:
        meta = entry["metadata"]
        # Ensure code_type is set (should already be from preprocessing)
        meta["code_type"] = code_type

        uid = f"{code_type}_{entry['id']}"
        if uid in seen:
            continue

        documents.append(entry["text"])
        metadatas.append(_flatten(meta))
        ids.append(uid)
        seen.add(uid)

    # Batch upsert
    batch_size = 200  # Smaller batches for larger embeddings
    for start in range(0, len(documents), batch_size):
        stop = min(start + batch_size, len(documents))
        collection.upsert(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )
        print(f" Batch {start // batch_size + 1}: upserted {stop - start} sections")

    return len(ids)
95
+
96
+
97
def ingest_all(data_dir: str, db_path: str = DB_PATH) -> dict[str, int]:
    """Ingest all code JSON files into a fresh ChromaDB collection.

    Args:
        data_dir: Directory containing the preprocessed *_CODE.json files.
        db_path: Directory for the persistent ChromaDB store (recreated).

    Returns:
        Mapping of code type -> number of sections ingested.
    """
    print(f"Creating ChromaDB at {db_path} with embedding model: {EMBEDDING_MODEL}")
    _client, collection = create_collection(db_path, reset=True)

    counts: dict[str, int] = {}
    for filename, code_type in CODE_FILES.items():
        json_path = os.path.join(data_dir, filename)
        if os.path.exists(json_path):
            # FIX: this previously printed the literal "(unknown)" instead of
            # the actual file being ingested.
            print(f"\nIngesting {filename} as '{code_type}'...")
            count = ingest_json_file(collection, json_path, code_type)
            counts[code_type] = count
            print(f" -> {count} sections ingested")
        else:
            print(f"WARNING: {json_path} not found, skipping.")

    total = sum(counts.values())
    print(f"\nIngestion complete. Total: {total} sections across {len(counts)} code types.")
    return counts
116
+
117
+
118
if __name__ == "__main__":
    # Optional CLI argument: directory holding the preprocessed JSON files;
    # defaults to the directory this script lives in.
    source_dir = sys.argv[1] if len(sys.argv) > 1 else os.path.dirname(__file__)
    ingest_all(source_dir)
data/nyc_code_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cba84d670f2a081621d233004d1dc55d6e6766c0d9b3a5dbae28e4337aed86b
3
+ size 118640640
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b13fd3171ed75d30bbc76630532fc4ff49c459ac97735349fc8670bc5402056e
3
+ size 21180000
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4976cac8d856d3914be9284cb6405a0b16019a1c69f0d98c21d517da4492fed0
3
+ size 100
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abf1da4a172d264eca520e35093cd9d1dbfc1a2174c5cc08f2bf14e40dc27c0c
3
+ size 266462
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c59d245953aaf1e3e56e16a29fc026d3d3b255c07909bdf1345af07e98e51a70
3
+ size 20000
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2798c188a968c06bd2a7b8bf9886e25e9f0ef8de40118f57f92647e11c7e58cf
3
+ size 42916
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7351c3a89b1d41bdbc04c8cda4131784642164ac6289029f21ea79369ecebfd
3
+ size 43050468
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7de83396c6ea38709c2085abdd0ad029742aff3877b5cf4499cafca3bd2d3977
3
+ size 100
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa1c7ef73ee5c32a15415bf8d5e8d43ec77e26cfbf77cfe04e7ccbb6b295b705
3
+ size 557222
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a246c61293a0e64dc3582af64fbb6af58c2e9c26228c15c5023f6fd32d4dcff1
3
+ size 40652
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a17b8f2739f3b77723cdb5a3c0de9fcbd4e64d65b16a9be028ba214ecab449e2
3
+ size 86212
data/preprocess_codes.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Improved NYC code preprocessing — fixes duplicates, improves metadata, preserves structure."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import json
6
+ import os
7
+ import re
8
+ from collections import Counter, OrderedDict
9
+
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Text cleaning
13
+ # ---------------------------------------------------------------------------
14
+
15
def clean_and_flatten(text: str) -> str:
    """Fix mid-word line breaks and collapse whitespace while preserving list structure.

    Hyphenated words split across lines are rejoined, numbered list items and
    "Exception" clauses are preserved as their own lines, and all remaining
    whitespace is collapsed to single spaces.
    """
    newline_steps = (
        # Rejoin words hyphenated across a line break (e.g. "accord-\nance").
        (r"(\w+)-\s*\n\s*(\w+)", r"\1\2"),
        # Tag numbered list items so they survive the flatten below.
        (r"\n\s*(\d+\.)\s+", r" __LISTBREAK__ \1 "),
        # Tag "Exception"/"Exceptions" clauses the same way.
        (r"\n\s*(Exception(?:s)?[\s:.])", r" __LISTBREAK__ \1"),
    )
    for pat, repl in newline_steps:
        text = re.sub(pat, repl, text)

    flattened = text.replace("\n", " ")
    # Normalize spaced dashes in section numbers (e.g. "28 - 101" -> "28-101").
    flattened = re.sub(r"(\d+)\s*-\s*(\d+)", r"\1-\2", flattened)
    flattened = re.sub(r"\s+", " ", flattened).strip()
    # Restore the tagged list breaks as real newlines.
    return flattened.replace("__LISTBREAK__", "\n")
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Anchor / section detection
33
+ # ---------------------------------------------------------------------------
34
+
35
+ def get_dominant_anchor(content: str) -> str | None:
36
+ """Detect the dominant chapter digit (1-9) or Appendix letter (A-Z)."""
37
+ anchors = re.findall(
38
+ r"(?m)^(?:\*?\s?§?\s?)(?:([1-9])\d{2,3}\.|([A-Z])(?:\d{2,3})?\.)",
39
+ content,
40
+ )
41
+ found = [item for sublist in anchors for item in sublist if item]
42
+ if not found:
43
+ return None
44
+ return Counter(found).most_common(1)[0][0]
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Metadata extraction from section text
49
+ # ---------------------------------------------------------------------------
50
+
51
# Occupancy groups referenced as "Group A-2", "Occupancy B", etc.
_OCCUPANCY_RE = re.compile(
    r"\b(?:Group|Occupancy|Classification)\s+"
    r"([A-Z]-?\d?(?:\s*,\s*[A-Z]-?\d?)*)",
    re.IGNORECASE,
)
# Construction types "Type I".."Type VB".
_CONSTRUCTION_TYPE_RE = re.compile(
    r"\bType\s+(I[A-B]?|II[A-B]?|III[A-B]?|IV[A-B]?|V[A-B]?)\b",
    re.IGNORECASE,
)
# "Exception:" / "Exceptions." clause markers.
_EXCEPTION_RE = re.compile(r"\bException(?:s)?\s*[:.]", re.IGNORECASE)
# Cross-references such as "Section 903.2" or "Sections 903.2 and 907.1".
_CROSS_REF_RE = re.compile(
    r"(?:Section|Sections|§)\s+(\d{2,4}(?:\.\d+)*(?:\s*(?:,|and|through)\s*\d{2,4}(?:\.\d+)*)*)",
    re.IGNORECASE,
)


def extract_rich_metadata(section_id: str, text: str, code_type: str) -> dict:
    """Extract enhanced metadata from section text for better filtering.

    Pulls occupancy classes, construction types, exception markers and
    cross-referenced sections out of *text* and combines them with the
    section-number hierarchy derived from *section_id*.
    """
    parts = section_id.split(".")
    major = parts[0]
    minor = ".".join(parts[:2]) if len(parts) > 1 else major

    # Occupancy classes, de-duplicated in first-seen order.
    occupancy_classes: list[str] = []
    for group in _OCCUPANCY_RE.findall(text):
        for token in re.split(r"\s*,\s*", group):
            token = token.strip().upper()
            if token and token not in occupancy_classes:
                occupancy_classes.append(token)

    # Construction types, upper-cased and sorted for stable output.
    construction_types = sorted({m.upper() for m in _CONSTRUCTION_TYPE_RE.findall(text)})

    # One findall serves both the boolean flag and the count.
    exception_hits = _EXCEPTION_RE.findall(text)

    # Cross-references, excluding self-references, first-seen order.
    cross_references: list[str] = []
    for group in _CROSS_REF_RE.findall(text):
        for ref in re.split(r"\s*(?:,|and|through)\s*", group):
            ref = ref.strip()
            if ref and ref != section_id and ref not in cross_references:
                cross_references.append(ref)

    return {
        "section_full": section_id,
        "parent_major": major,
        "parent_minor": minor,
        "code_type": code_type,
        "occupancy_classes": occupancy_classes,
        "construction_types": construction_types,
        "has_exceptions": bool(exception_hits),
        "exception_count": len(exception_hits),
        "cross_references": cross_references,
    }
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Core extraction with deduplication
114
+ # ---------------------------------------------------------------------------
115
+
116
def extract_trade_sections(
    file_path: str,
    global_dict: OrderedDict,
    code_type: str,
    seen_hashes: dict[str, set[str]],
) -> OrderedDict:
    """Extract code sections from a single source file with deduplication.

    Mutates and returns *global_dict* (section_id -> record) and *seen_hashes*
    (section_id -> set of md5 digests of bodies already merged). Both are
    shared across files of the same code type so repeated sections in later
    files are either skipped (exact duplicate) or appended as continuations.
    """
    if not os.path.exists(file_path):
        return global_dict

    with open(file_path, "r", encoding="utf-8") as f:
        # Non-breaking spaces appear in the extracted text; normalize them.
        content = f.read().replace("\xa0", " ")

    # The dominant leading digit/letter tells us which chapter family this
    # file belongs to; without one there is nothing to extract.
    anchor = get_dominant_anchor(content)
    if not anchor:
        return global_dict

    # Build section-matching regex
    if anchor.isalpha():
        # Appendix-style IDs, e.g. "A101.2" (optional leading letter).
        id_pattern = rf"[A-Z]?{re.escape(anchor)}\d*(?:\.\d+)+"
    else:
        # Numeric IDs, e.g. "907.2.1" for anchor "9".
        id_pattern = rf"{re.escape(anchor)}\d{{2,3}}(?:\.\d+)+"

    # A section heading is the ID at line start followed by its first word.
    pattern = rf"(?m)^\s*[\*§]?\s*({id_pattern})\s+([A-Z\w]+)"
    matches = list(re.finditer(pattern, content))

    # Headings whose "first word" is one of these are actually inline
    # references (e.g. "Sections 903.2 and 907.1"), not real section starts.
    skip_words = {
        "and", "through", "to", "or", "sections", "the", "of", "in", "under", "as",
    }

    for i in range(len(matches)):
        clean_id = matches[i].group(1).strip()
        first_word = matches[i].group(2)

        if first_word.lower() in skip_words:
            continue

        # A section's body runs until the next heading (or end of file).
        start_pos = matches[i].start()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)

        raw_body = content[start_pos:end_pos]
        clean_body = clean_and_flatten(raw_body)

        # Bodies shorter than 60 chars are headings/noise — skip.
        if len(clean_body) < 60:
            continue

        # ------ DEDUPLICATION via content hashing ------
        block_hash = hashlib.md5(clean_body.encode()).hexdigest()

        if clean_id in global_dict:
            # Check if this block is a genuine duplicate
            if clean_id not in seen_hashes:
                seen_hashes[clean_id] = set()
            if block_hash in seen_hashes[clean_id]:
                continue  # Skip exact duplicate
            seen_hashes[clean_id].add(block_hash)

            # New content under an existing ID: append as a continuation and
            # record the extra source file.
            global_dict[clean_id]["text"] += f" [CONT.]: {clean_body}"
            source_name = os.path.basename(file_path)
            if source_name not in global_dict[clean_id]["metadata"]["source"]:
                global_dict[clean_id]["metadata"]["source"] += f", {source_name}"
        else:
            # First sighting of this section ID: build the record with rich
            # metadata and a hierarchy prefix to aid embedding retrieval.
            seen_hashes[clean_id] = {block_hash}
            metadata = extract_rich_metadata(clean_id, clean_body, code_type)
            metadata["source"] = os.path.basename(file_path)

            global_dict[clean_id] = {
                "id": clean_id,
                "text": f"CONTEXT: {metadata['parent_major']} > {metadata['parent_minor']} | CONTENT: {clean_id} {clean_body}",
                "metadata": metadata,
            }

    return global_dict
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Main pipeline
193
+ # ---------------------------------------------------------------------------
194
+
195
+ # File ranges per code type (same as original, but parameterized)
196
# Per-code-type extraction config: which numbered source text files
# (NNN.txt) belong to each code, and the JSON file to write.
# NOTE(review): the excluded numbers appear to be non-section pages
# (covers, tables, etc.) — confirm against the source text dump.
CODE_CONFIGS = {
    "Building": {
        "file_range": [i for i in range(58, 112) if i not in {90, 91, 92, 93, 94, 100, 101, 103, 106, 107}],
        "output_file": "BUILDING_CODE.json",
    },
    "FuelGas": {
        "file_range": [i for i in range(43, 58) if i not in {50, 51, 52, 53, 54, 56}],
        "output_file": "FUEL_GAS_CODE.json",
    },
    "Mechanical": {
        "file_range": [i for i in range(24, 43) if i not in {30, 31}],
        "output_file": "MECHANICAL_CODE.json",
    },
    "Plumbing": {
        "file_range": list(range(1, 24)),
        "output_file": "PLUMBING_CODE.json",
    },
    "Administrative": {
        "file_range": list(range(112, 160)),
        "output_file": "GENERAL_ADMINISTRATIVE_PROVISIONS.json",
    },
}
218
+
219
+
220
def preprocess_all(text_dir: str, output_dir: str) -> dict[str, int]:
    """Run preprocessing for all code types. Returns counts per type.

    Reads NNN.txt files from *text_dir* per CODE_CONFIGS and writes one JSON
    file per code type into *output_dir*.
    """
    os.makedirs(output_dir, exist_ok=True)
    section_counts: dict[str, int] = {}

    for code_type, cfg in CODE_CONFIGS.items():
        # Section dict and hash registry are shared across all files of the
        # same code type so duplicates across files are merged/skipped.
        sections: OrderedDict = OrderedDict()
        hashes: dict[str, set[str]] = {}

        for file_num in cfg["file_range"]:
            source_path = os.path.join(text_dir, f"{file_num:03d}.txt")
            if not os.path.exists(source_path):
                continue
            print(f"[{code_type}] Processing {source_path}...")
            extract_trade_sections(source_path, sections, code_type, hashes)

        records = list(sections.values())
        out_path = os.path.join(output_dir, cfg["output_file"])
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

        section_counts[code_type] = len(records)
        print(f"[{code_type}] Wrote {len(records)} sections to {out_path}")

    return section_counts
244
+
245
+
246
if __name__ == "__main__":
    import sys

    # CLI: preprocess_codes.py [text_dir] [output_dir]
    source_dir = sys.argv[1] if len(sys.argv) > 1 else "Text"
    dest_dir = sys.argv[2] if len(sys.argv) > 2 else "data"

    summary = preprocess_all(source_dir, dest_dir)
    print(f"\nPreprocessing complete: {summary}")
graph.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph definition — compliance workflow with parallel fan-out/fan-in and investigation loop."""
2
+ from __future__ import annotations
3
+
4
+ from langgraph.graph import END, StateGraph
5
+
6
+ from config import MAX_INVESTIGATION_ROUNDS
7
+ from nodes.code_lookup import initial_code_lookup, targeted_code_lookup
8
+ from nodes.compliance_analyst import compliance_analyst
9
+ from nodes.compliance_planner import compliance_planner
10
+ from nodes.cropper import ProgressCallback, execute_crops
11
+ from nodes.annotator import annotate_crops
12
+ from nodes.deliberation import deliberation
13
+ from nodes.final_verdict import final_verdict
14
+ from state import ComplianceState
15
+ from tools.crop_cache import CropCache
16
+ from tools.image_store import ImageStore
17
+
18
+
19
def _build_compliance_graph(
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
) -> StateGraph:
    """Build the compliance analysis graph with parallel fan-out/fan-in.

    Args:
        image_store: Shared store the crop/annotate/analysis nodes read and
            write images through; closed over by the node wrappers below.
        crop_cache: Optional cache passed to execute_crops.
        progress_callback: Optional progress hook passed to execute_crops.

    Returns:
        An un-compiled StateGraph (see compile_compliance_graph).

    Architecture:
        compliance_planner
          ├── execute_crops ──► annotate_crops (optional) ──┐
          └── initial_code_lookup ─────────────────────────┤

        compliance_analyst ◄──┐
          │   │   │           │
        (crops)(code)(done)   │
          │    │    │         │
          └────┘    │         │
               │    │         │
               ▼    ▼         │
        deliberation (opt)    │
               │              │
        final_verdict ────────┘

        END
    """

    # ---- Wrap nodes that need ImageStore / CropCache / callback ----
    # LangGraph nodes take only the state, so dependencies are bound here
    # via closures rather than stored on the state.
    def _execute_crops(state: ComplianceState) -> dict:
        return execute_crops(state, image_store, crop_cache, progress_callback)

    def _annotate_crops(state: ComplianceState) -> dict:
        return annotate_crops(state, image_store)

    def _compliance_analyst(state: ComplianceState) -> dict:
        return compliance_analyst(state, image_store)

    def _deliberation(state: ComplianceState) -> dict:
        return deliberation(state, image_store)

    # ---- Build graph ----
    graph = StateGraph(ComplianceState)

    # Add all nodes
    graph.add_node("compliance_planner", compliance_planner)
    graph.add_node("execute_crops", _execute_crops)
    graph.add_node("annotate_crops", _annotate_crops)
    graph.add_node("initial_code_lookup", initial_code_lookup)
    graph.add_node("compliance_analyst", _compliance_analyst)
    graph.add_node("targeted_code_lookup", targeted_code_lookup)
    graph.add_node("deliberation", _deliberation)
    graph.add_node("final_verdict", final_verdict)

    # ---- Edges ----

    # Entry: planner is always first
    graph.set_entry_point("compliance_planner")

    # Parallel fan-out: planner → both execute_crops AND initial_code_lookup
    graph.add_edge("compliance_planner", "execute_crops")
    graph.add_edge("compliance_planner", "initial_code_lookup")

    # After crops: optionally annotate, then go to compliance_analyst
    def _after_crops(state: ComplianceState) -> str:
        # Skip annotation when disabled globally or when no crop task
        # actually requests it.
        if not state.get("enable_annotation", True):
            return "compliance_analyst"
        crop_tasks = state.get("crop_tasks", [])
        if not any(t.get("annotate") and t.get("annotation_prompt") for t in crop_tasks):
            return "compliance_analyst"
        return "annotate_crops"

    graph.add_conditional_edges(
        "execute_crops",
        _after_crops,
        {"annotate_crops": "annotate_crops", "compliance_analyst": "compliance_analyst"},
    )
    graph.add_edge("annotate_crops", "compliance_analyst")

    # Code lookup also feeds into compliance_analyst (fan-in)
    graph.add_edge("initial_code_lookup", "compliance_analyst")

    # After analysis: loop for more evidence, deliberate, or go to verdict
    def _after_analyst(state: ComplianceState) -> str:
        # Loop back for more evidence only while under the round budget;
        # code lookups take priority over additional crops.
        needs_more = state.get("needs_more_investigation", False)
        round_num = state.get("investigation_round", 0)
        max_rounds = state.get("max_rounds", MAX_INVESTIGATION_ROUNDS)
        has_additional_crops = bool(state.get("additional_crop_tasks", []))
        has_additional_code = bool(state.get("additional_code_queries", []))
        enable_consensus = state.get("enable_consensus", False)

        if needs_more and round_num < max_rounds:
            if has_additional_code:
                return "targeted_code_lookup"
            if has_additional_crops:
                return "execute_crops"
        if enable_consensus:
            return "deliberation"
        return "final_verdict"

    graph.add_conditional_edges(
        "compliance_analyst",
        _after_analyst,
        {
            "execute_crops": "execute_crops",
            "targeted_code_lookup": "targeted_code_lookup",
            "deliberation": "deliberation",
            "final_verdict": "final_verdict",
        },
    )

    # Targeted code lookup feeds back to analyst
    graph.add_edge("targeted_code_lookup", "compliance_analyst")

    # Deliberation → verdict
    graph.add_edge("deliberation", "final_verdict")

    # Verdict → END
    graph.add_edge("final_verdict", END)

    return graph
139
+
140
+
141
def compile_compliance_graph(
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
):
    """Return a compiled, ready-to-invoke compliance graph."""
    workflow = _build_compliance_graph(image_store, crop_cache, progress_callback)
    return workflow.compile()
nodes/__init__.py ADDED
File without changes
nodes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (230 Bytes). View file
 
nodes/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (5.18 kB). View file
 
nodes/__pycache__/code_lookup.cpython-313.pyc ADDED
Binary file (10.6 kB). View file
 
nodes/__pycache__/compliance_analyst.cpython-313.pyc ADDED
Binary file (8.35 kB). View file
 
nodes/__pycache__/compliance_planner.cpython-313.pyc ADDED
Binary file (6.46 kB). View file
 
nodes/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (8.87 kB). View file
 
nodes/__pycache__/deliberation.cpython-313.pyc ADDED
Binary file (3.32 kB). View file
 
nodes/__pycache__/final_verdict.cpython-313.pyc ADDED
Binary file (3.74 kB). View file
 
nodes/__pycache__/metadata_generator.cpython-313.pyc ADDED
Binary file (7.52 kB). View file
 
nodes/annotator.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """annotate_crops node — nano-banana (Gemini image generation) for semantic annotation."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from google import genai
8
+ from google.genai import types
9
+ from PIL import Image
10
+
11
+ from config import ANNOTATOR_MODEL, GOOGLE_API_KEY
12
+ from prompts.annotator import ANNOTATION_WRAPPER
13
+ from state import DrawingReaderState, ImageRef
14
+ from tools.image_store import ImageStore
15
+
16
+
17
+ def _extract_generated_image(response) -> Image.Image | None:
18
+ """Extract the generated image from a Gemini image-generation response."""
19
+ for part in response.candidates[0].content.parts:
20
+ if part.inline_data is not None:
21
+ return Image.open(io.BytesIO(part.inline_data.data))
22
+ return None
23
+
24
+
25
def _annotate_single_crop_sync(
    client: genai.Client,
    crop_ref: ImageRef,
    annotation_prompt: str,
    image_store: ImageStore,
) -> ImageRef | None:
    """Annotate one crop using nano-banana (synchronous).

    Sends the crop plus the wrapped annotation prompt to the image model and
    stores the returned image. Returns the new ImageRef, or None when the
    model produced no image part.
    """
    source_bytes = image_store.load_bytes(crop_ref)
    prompt = ANNOTATION_WRAPPER.format(annotation_prompt=annotation_prompt)

    # Request both TEXT and IMAGE so the model can return an annotated copy.
    response = client.models.generate_content(
        model=ANNOTATOR_MODEL,
        contents=[
            types.Part.from_bytes(data=source_bytes, mime_type="image/png"),
            prompt,
        ],
        config=types.GenerateContentConfig(
            response_modalities=["TEXT", "IMAGE"],
        ),
    )

    generated = _extract_generated_image(response)
    if generated is None:
        return None
    return image_store.save_annotated(crop_ref, generated)
53
+
54
+
55
def annotate_crops(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Run nano-banana annotation on crops that need it.

    Matches the current crop_tasks to the most recently produced crops by
    position, annotates the flagged ones in parallel, and returns the new
    annotated ImageRefs plus a human-readable status line.
    """
    crop_tasks = state.get("crop_tasks", [])
    image_refs = state.get("image_refs", [])

    # Build a mapping: find crops that need annotation.
    # The most recent batch of crops corresponds to the current crop_tasks.
    # Take the LAST len(crop_tasks) crops from image_refs to match by position,
    # so that on loop-back rounds we only match against the newest crops.
    crops_needing_annotation: list[tuple[ImageRef, str]] = []

    all_crops = [r for r in image_refs if r["crop_type"] == "crop"]
    # Only the tail — the most recent batch produced by execute_crops
    recent_crops = all_crops[-len(crop_tasks):] if crop_tasks else []

    for i, task in enumerate(crop_tasks):
        # Positional match: task i corresponds to recent_crops[i]; the bounds
        # check guards against a batch that produced fewer crops than tasks.
        if task["annotate"] and task["annotation_prompt"] and i < len(recent_crops):
            crops_needing_annotation.append(
                (recent_crops[i], task["annotation_prompt"])
            )

    if not crops_needing_annotation:
        return {"status_message": ["No annotation needed for these crops."]}

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Use a thread pool instead of asyncio to avoid event-loop conflicts
    # with Streamlit's own event loop.
    # results is index-aligned with crops_needing_annotation; each slot holds
    # an ImageRef on success, None when no image came back, or the Exception.
    results: list[ImageRef | None | Exception] = [None] * len(crops_needing_annotation)

    with ThreadPoolExecutor(max_workers=min(len(crops_needing_annotation), 4)) as pool:
        future_to_idx = {}
        for i, (ref, prompt) in enumerate(crops_needing_annotation):
            future = pool.submit(
                _annotate_single_crop_sync, client, ref, prompt, image_store,
            )
            future_to_idx[future] = i

        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception as e:
                # Keep the exception in-slot so per-crop failures are reported
                # without aborting the remaining annotations.
                results[idx] = e

    annotated_refs: list[ImageRef] = []
    errors: list[str] = []
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            errors.append(f"Annotation {i} failed: {result}")
        elif result is not None:
            annotated_refs.append(result)
        else:
            errors.append(f"Annotation {i} returned no image")

    status = f"Annotated {len(annotated_refs)} of {len(crops_needing_annotation)} crops."
    if errors:
        status += f" Issues: {'; '.join(errors)}"

    return {
        "image_refs": annotated_refs,
        "status_message": [status],
    }
nodes/code_lookup.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """code_lookup node — lightweight snippet reviewer (no multi-turn tool loop).
2
+
3
+ Flow:
4
+ 1. discover_code_locations() — ChromaDB semantic search (~1-2 sec per query)
5
+ 2. GPT reviews the raw snippets in a SINGLE call — flags relevant ones with
6
+ a relevance tag and brief note
7
+ 3. Raw flagged snippets + GPT notes go to the compliance analyst
8
+
9
+ No fetch_full_chapter in the initial pass. The compliance analyst can request
10
+ targeted chapter fetches via additional_code_queries if it needs more context.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from concurrent.futures import ThreadPoolExecutor, as_completed
17
+ from datetime import datetime
18
+
19
+ from openai import OpenAI
20
+
21
+ from config import CODE_LOOKUP_MODEL, OPENAI_API_KEY
22
+ from prompts.code_lookup import CODE_REVIEWER_SYSTEM_PROMPT
23
+ from state import AgentMessage, CodeQuery, CodeSection, ComplianceState
24
+ from tools.chroma_tools import QueryCache, discover_code_locations
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Single-call snippet reviewer (replaces the multi-turn tool loop)
31
+ # ---------------------------------------------------------------------------
32
+
33
def _review_snippets(
    research_goal: str,
    discover_report: str,
) -> tuple[str, list[CodeSection]]:
    """GPT reviews discover results in ONE call.

    Returns (brief_review, flagged_sections). Falls back to the raw model
    output and no flagged sections when the JSON response cannot be parsed.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    user_content = (
        f"## Research Goal\n{research_goal}\n\n"
        f"## Code Snippets from Database\n{discover_report}"
    )
    response = client.chat.completions.create(
        model=CODE_LOOKUP_MODEL,
        messages=[
            {"role": "system", "content": CODE_REVIEWER_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        response_format={"type": "json_object"},
    )

    raw = response.choices[0].message.content or "{}"
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        logger.warning("GPT snippet review returned invalid JSON, using raw text.")
        return raw, []

    # Snippets are capped at 1500 chars to bound the analyst's context size.
    flagged_sections = [
        CodeSection(
            section_full=item.get("section_id", "?"),
            code_type=item.get("code_type", "?"),
            parent_major=item.get("chapter", "?"),
            text=item.get("snippet", "")[:1500],
            relevance=item.get("relevance_note", ""),
        )
        for item in parsed.get("relevant_sections", [])
    ]

    return parsed.get("summary", "No summary provided."), flagged_sections
79
+
80
+
81
def _run_single_lookup(
    cq: CodeQuery,
    query_cache: QueryCache | None = None,
) -> tuple[str, list[CodeSection], str]:
    """Run discover + review for ONE code query.

    Returns (summary, flagged_sections, discover_report_raw).
    """
    goal = f"{cq['query']} (Context: {cq['context']})"
    logger.info("ChromaDB query: %s", goal)

    # Step 1: ChromaDB discover (fast, ~1-2s)
    report = discover_code_locations(goal, cache=query_cache)

    # Step 2: GPT reviews snippets in a single call
    brief, flagged = _review_snippets(goal, report)

    return brief, flagged, report
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # LangGraph node functions
103
+ # ---------------------------------------------------------------------------
104
+
105
+
106
def initial_code_lookup(state: ComplianceState) -> dict:
    """Run the initial code lookup based on planner's code_queries.

    All queries run in PARALLEL via ThreadPoolExecutor.
    Each query = 1 discover + 1 GPT review call (no multi-turn loop).

    Returns a partial state update containing the combined ``code_report``,
    all flagged ``code_sections``, per-query ``discussion_log`` entries, and
    a single ``status_message``.
    """
    code_queries = state.get("code_queries", [])
    # No planned queries: return an explanatory log entry rather than failing.
    if not code_queries:
        return {
            "code_report": "No code queries were planned.",
            "code_sections": [],
            "discussion_log": [
                AgentMessage(
                    timestamp=datetime.now().strftime("%H:%M:%S"),
                    agent="code_analyst",
                    action="search_code",
                    summary="No code queries to execute.",
                    detail="The planner did not generate any code queries.",
                    evidence_refs=[],
                )
            ],
            "status_message": ["No code queries to execute."],
        }

    # One cache shared by all workers for this node invocation.
    query_cache = QueryCache()
    discussion_messages: list[AgentMessage] = []

    # Add "searching" messages for all queries upfront
    for cq in code_queries:
        discussion_messages.append(
            AgentMessage(
                timestamp=datetime.now().strftime("%H:%M:%S"),
                agent="code_analyst",
                action="search_code",
                summary=f"Searching: {cq['query'][:80]}...",
                detail=f"Focus area: {cq['focus_area']}\nContext: {cq['context']}",
                evidence_refs=[],
            )
        )

    # Execute ALL queries concurrently.
    # results maps the query's ORIGINAL index -> (summary, flagged, raw report)
    # so output can be reassembled in plan order after out-of-order completion.
    results: dict[int, tuple[str, list[CodeSection], str]] = {}

    with ThreadPoolExecutor(max_workers=min(len(code_queries), 4)) as pool:
        futures = {
            pool.submit(_run_single_lookup, cq, query_cache): i
            for i, cq in enumerate(code_queries)
        }
        for future in as_completed(futures):
            i = futures[future]
            try:
                summary, flagged, _raw = future.result()
                results[i] = (summary, flagged, _raw)
                cq = code_queries[i]
                section_ids = ", ".join(
                    s["section_full"] for s in flagged
                )
                discussion_messages.append(
                    AgentMessage(
                        timestamp=datetime.now().strftime("%H:%M:%S"),
                        agent="code_analyst",
                        action="search_code",
                        summary=(
                            f"Flagged {len(flagged)} sections "
                            f"for '{cq['query']}'"
                        ),
                        detail=(
                            f"**Query:** {cq['query']}\n"
                            f"**Focus:** {cq['focus_area']}\n\n"
                            f"**Sections:** {section_ids}\n\n"
                            f"{summary[:800]}"
                        ),
                        evidence_refs=[s["section_full"] for s in flagged],
                    )
                )
            except Exception as e:
                # A failed query is recorded (not raised) so the remaining
                # queries still contribute to the combined report.
                logger.error("Code query %d failed: %s", i, e)
                results[i] = (f"Error: {e}", [], "")
                discussion_messages.append(
                    AgentMessage(
                        timestamp=datetime.now().strftime("%H:%M:%S"),
                        agent="code_analyst",
                        action="search_code",
                        summary=f"Query {i + 1} failed: {e}",
                        detail=str(e),
                        evidence_refs=[],
                    )
                )

    # Reassemble in original order
    report_parts: list[str] = []
    all_sections: list[CodeSection] = []
    for i in range(len(code_queries)):
        summary, flagged, _raw = results.get(i, ("No result", [], ""))
        cq = code_queries[i]
        report_parts.append(
            f"### Query {i + 1}: {cq['focus_area']}\n{summary}"
        )
        all_sections.extend(flagged)

    combined_report = "\n\n---\n\n".join(report_parts)

    # NOTE(review): discussion_log/status_message appear to rely on additive
    # state reducers (new entries returned, not merged here) — confirm in state.py.
    return {
        "code_report": combined_report,
        "code_sections": all_sections,
        "discussion_log": discussion_messages,
        "status_message": [
            f"Code lookup complete. {len(all_sections)} relevant sections "
            f"flagged across {len(code_queries)} queries."
        ],
    }
217
+
218
+
219
def targeted_code_lookup(state: ComplianceState) -> dict:
    """Run additional code lookups requested by the compliance analyst.

    These may use fetch_full_chapter for deeper context when the analyst
    needs full exception text or cross-reference detail.

    Unlike ``initial_code_lookup``, follow-up queries run SEQUENTIALLY and
    the resulting text is appended to the existing ``code_report``.
    """
    additional_queries = state.get("additional_code_queries", [])
    if not additional_queries:
        return {
            "status_message": ["No additional code queries."],
        }

    query_cache = QueryCache()
    all_sections: list[CodeSection] = []
    report_parts: list[str] = []
    discussion_messages: list[AgentMessage] = []

    # NOTE(review): the loop index is currently unused.
    for i, cq in enumerate(additional_queries):
        # Log the search before running it so the UI shows progress.
        discussion_messages.append(
            AgentMessage(
                timestamp=datetime.now().strftime("%H:%M:%S"),
                agent="code_analyst",
                action="search_code",
                summary=f"Follow-up search: {cq['query'][:80]}...",
                detail=f"Requested by compliance analyst.\nFocus: {cq['focus_area']}",
                evidence_refs=[],
            )
        )

        summary, flagged, _raw = _run_single_lookup(cq, query_cache)
        report_parts.append(summary)
        all_sections.extend(flagged)

        section_ids = ", ".join(s["section_full"] for s in flagged)
        discussion_messages.append(
            AgentMessage(
                timestamp=datetime.now().strftime("%H:%M:%S"),
                agent="code_analyst",
                action="search_code",
                summary=(
                    f"Follow-up: flagged {len(flagged)} sections "
                    f"for '{cq['query']}'"
                ),
                detail=(
                    f"**Query:** {cq['query']}\n"
                    f"**Focus:** {cq['focus_area']}\n\n"
                    f"**Sections:** {section_ids}\n\n"
                    f"{summary[:800]}"
                ),
                evidence_refs=[s["section_full"] for s in flagged],
            )
        )

    # Append to existing report
    existing_report = state.get("code_report", "")
    new_report = "\n\n---\n\n".join(report_parts)
    combined_report = f"{existing_report}\n\n## FOLLOW-UP CODE RESEARCH\n\n{new_report}"

    # NOTE(review): "code_sections" returns only the NEW sections — this
    # presumably relies on an additive reducer in state.py; verify, otherwise
    # earlier sections would be overwritten.
    return {
        "code_report": combined_report,
        "code_sections": all_sections,
        "additional_code_queries": [],  # Clear after processing
        "discussion_log": discussion_messages,
        "status_message": [
            f"Targeted code lookup complete. {len(all_sections)} additional sections "
            f"from {len(additional_queries)} follow-up queries."
        ],
    }
nodes/compliance_analyst.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """compliance_analyst node — multimodal fusion of images + code for compliance determination."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from datetime import datetime
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+ from config import ANALYZER_MODEL, GOOGLE_API_KEY
12
+ from prompts.compliance_analyst import COMPLIANCE_ANALYST_SYSTEM_PROMPT
13
+ from state import AgentMessage, CodeQuery, ComplianceState, CropTask
14
+ from tools.image_store import ImageStore
15
+
16
+
17
def compliance_analyst(state: ComplianceState, image_store: ImageStore) -> dict:
    """Review all cropped images AND code sections to produce compliance findings.

    Builds one multimodal Gemini request (question + code report + discussion
    log + images in legend/detail/annotated order), then parses an optional
    trailing ```json block through which the model can request another
    investigation round (more crops and/or code lookups).
    """
    question = state["question"]
    image_refs = state.get("image_refs", [])
    code_report = state.get("code_report", "")
    legend_pages = set(state.get("legend_pages", []))
    investigation_round = state.get("investigation_round", 0)
    discussion_log = state.get("discussion_log", [])

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Build multimodal content
    content_parts: list[types.Part] = []

    # 1. User question
    content_parts.append(types.Part.from_text(text=f"USER COMPLIANCE QUESTION: {question}"))

    # 2. Code report (legal requirements)
    if code_report:
        content_parts.append(
            types.Part.from_text(
                text=f"\n=== LEGAL REQUIREMENTS FROM NYC CODE ===\n{code_report}"
            )
        )
    else:
        content_parts.append(
            types.Part.from_text(text="\n=== NO CODE SECTIONS RETRIEVED ===\n")
        )

    # 3. Discussion log summary (what previous agents found)
    if discussion_log:
        log_summary = "\n".join(
            f"[{m['timestamp']}] {m['agent']}: {m['summary']}"
            for m in discussion_log[-10:]  # Last 10 messages
        )
        content_parts.append(
            types.Part.from_text(
                text=f"\n=== AGENT DISCUSSION LOG ===\n{log_summary}"
            )
        )

    # 4. Images — legends first, then detail crops, then annotated
    legend_refs = [r for r in image_refs if r["page_num"] in legend_pages and r["crop_type"] == "crop"]
    detail_crops = [r for r in image_refs if r["page_num"] not in legend_pages and r["crop_type"] == "crop"]
    annotated_refs = [r for r in image_refs if r["crop_type"] == "annotated"]

    ordered_refs = legend_refs + detail_crops + annotated_refs

    if legend_refs:
        content_parts.append(
            types.Part.from_text(text="\n=== LEGEND / SCHEDULE CROPS (study these first) ===")
        )

    # Section headers are inserted just before the FIRST image of each group.
    first_detail_id = detail_crops[0]["id"] if detail_crops else None
    first_annotated_id = annotated_refs[0]["id"] if annotated_refs else None

    for ref in ordered_refs:
        if first_detail_id is not None and ref["id"] == first_detail_id:
            content_parts.append(types.Part.from_text(text="\n=== DETAIL CROPS ==="))
        if first_annotated_id is not None and ref["id"] == first_annotated_id:
            content_parts.append(
                types.Part.from_text(text="\n=== ANNOTATED CROPS (highlighted versions) ===")
            )

        content_parts.append(types.Part.from_text(text=f"\nImage: {ref['label']}"))
        # A single unreadable image degrades to a text placeholder instead of
        # aborting the whole analysis.
        try:
            content_parts.append(image_store.to_gemini_part(ref))
        except Exception as e:
            content_parts.append(
                types.Part.from_text(text=f"(Could not load image: {e})")
            )

    # 5. Investigation round context
    content_parts.append(
        types.Part.from_text(
            text=(
                f"\nThis is investigation round {investigation_round + 1}. "
                "Analyze the drawings against the code requirements. "
                "If you need more evidence (crops or code lookups), include a JSON block at the end."
            )
        )
    )

    # Call Gemini
    response = client.models.generate_content(
        model=ANALYZER_MODEL,
        contents=[types.Content(role="user", parts=content_parts)],
        config=types.GenerateContentConfig(
            system_instruction=COMPLIANCE_ANALYST_SYSTEM_PROMPT,
        ),
    )

    analysis_text = response.text

    # Parse additional investigation requests
    needs_more = False
    additional_crops: list[CropTask] = []
    additional_code_queries: list[CodeQuery] = []

    # Only a fenced JSON block that literally contains `"needs_more": true`
    # is treated as a follow-up request; any other JSON is left untouched.
    json_match = re.search(
        r"```json\s*(\{.*?\"needs_more\"\s*:\s*true.*?\})\s*```",
        analysis_text,
        re.DOTALL,
    )
    if json_match:
        try:
            extra = json.loads(json_match.group(1))
            if extra.get("needs_more"):
                needs_more = True

            for t in extra.get("additional_crops", []):
                # Model output is 1-indexed; internal page numbers are 0-indexed.
                raw_page = int(t.get("page_num", 1))
                additional_crops.append(
                    CropTask(
                        page_num=raw_page - 1,
                        crop_instruction=t.get("crop_instruction", ""),
                        annotate=bool(t.get("annotate", False)),
                        annotation_prompt=t.get("annotation_prompt", ""),
                        label=t.get("label", "Additional crop"),
                        priority=int(t.get("priority", 1)),
                    )
                )

            for q in extra.get("additional_code_queries", []):
                additional_code_queries.append(
                    CodeQuery(
                        query=q.get("query", ""),
                        focus_area=q.get("focus_area", ""),
                        context=q.get("context", ""),
                        priority=int(q.get("priority", 0)),
                    )
                )
        except (json.JSONDecodeError, KeyError):
            # Malformed request block: proceed with the analysis text only.
            pass

        # Clean the JSON block from the analysis text
        analysis_text = analysis_text[: json_match.start()].strip()

    # Build discussion message
    if needs_more:
        summary = (
            f"Round {investigation_round + 1} analysis complete. "
            f"Requesting {len(additional_crops)} more crops and "
            f"{len(additional_code_queries)} more code lookups."
        )
    else:
        summary = f"Round {investigation_round + 1} compliance analysis complete."

    discussion_msg = AgentMessage(
        timestamp=datetime.now().strftime("%H:%M:%S"),
        agent="compliance_analyst",
        action="analyze" if not needs_more else "request_more",
        summary=summary,
        detail=analysis_text[:1500],
        evidence_refs=[ref["id"] for ref in image_refs[:5]],
    )

    result: dict = {
        "compliance_analysis": analysis_text,
        "investigation_round": investigation_round + 1,
        "needs_more_investigation": needs_more,
        "discussion_log": [discussion_msg],
        "status_message": [summary],
    }

    # NOTE(review): crop requests are published under BOTH keys — presumably
    # "crop_tasks" feeds the cropper node and "additional_crop_tasks" is for
    # bookkeeping; confirm against graph.py wiring.
    if additional_crops:
        result["crop_tasks"] = additional_crops
        result["additional_crop_tasks"] = additional_crops
    if additional_code_queries:
        result["additional_code_queries"] = additional_code_queries

    return result
nodes/compliance_planner.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """compliance_planner node — dual-plan generation (crops + code queries)."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from datetime import datetime
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+ from config import GOOGLE_API_KEY, PLANNER_MODEL
12
+ from prompts.compliance_planner import COMPLIANCE_PLANNER_SYSTEM_PROMPT
13
+ from state import AgentMessage, CodeQuery, ComplianceState, CropTask
14
+
15
+
16
def compliance_planner(state: ComplianceState) -> dict:
    """Analyze page metadata + user question and produce dual plans for
    image cropping AND code lookup.

    The planner LLM answers with a JSON object (target_pages, legend_pages,
    crop_tasks, code_queries); page numbers are converted from the model's
    1-indexed convention to 0-indexed internal numbering and validated
    against the PDF's page count.
    """
    question = state["question"]
    num_pages = state.get("num_pages", 0)
    page_metadata_json = state.get("page_metadata_json", "")
    investigation_round = state.get("investigation_round", 0)

    client = genai.Client(api_key=GOOGLE_API_KEY)

    question_text = (
        f"USER COMPLIANCE QUESTION: {question}\n\n"
        f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
        f"This is investigation round {investigation_round + 1}.\n\n"
    )

    if page_metadata_json:
        question_text += f"PAGE METADATA:\n{page_metadata_json}"
    else:
        question_text += (
            "No page metadata available. Based on the question alone, "
            "plan what code lookups are needed. Crop tasks will use default pages."
        )

    response = client.models.generate_content(
        model=PLANNER_MODEL,
        contents=[types.Content(role="user", parts=[types.Part.from_text(text=question_text)])],
        config=types.GenerateContentConfig(
            system_instruction=COMPLIANCE_PLANNER_SYSTEM_PROMPT,
        ),
    )

    response_text = response.text.strip()

    # Parse JSON response (greedy brace match tolerates surrounding prose)
    json_match = re.search(r"\{.*\}", response_text, re.DOTALL)

    target_pages: list[int] = []
    legend_pages: list[int] = []
    crop_tasks: list[CropTask] = []
    code_queries: list[CodeQuery] = []

    if json_match:
        try:
            parsed = json.loads(json_match.group())
            # Valid 0-indexed page numbers for this PDF.
            valid_0indexed = set(range(num_pages))

            # Convert 1-indexed model pages to 0-indexed, dropping out-of-range ones.
            target_pages = [
                int(p) - 1 for p in parsed.get("target_pages", [])
                if int(p) - 1 in valid_0indexed
            ]
            legend_pages = [
                int(p) - 1 for p in parsed.get("legend_pages", [])
                if int(p) - 1 in valid_0indexed
            ]

            # NOTE(review): crop task page numbers are NOT range-checked the
            # way target_pages are — confirm downstream handles bad pages.
            for t in parsed.get("crop_tasks", []):
                raw_page = int(t.get("page_num", 1))
                crop_tasks.append(
                    CropTask(
                        page_num=raw_page - 1,
                        crop_instruction=t.get("crop_instruction", ""),
                        annotate=bool(t.get("annotate", False)),
                        annotation_prompt=t.get("annotation_prompt", ""),
                        label=t.get("label", f"Page {raw_page} crop"),
                        priority=int(t.get("priority", 1)),
                    )
                )

            for q in parsed.get("code_queries", []):
                code_queries.append(
                    CodeQuery(
                        query=q.get("query", ""),
                        focus_area=q.get("focus_area", ""),
                        context=q.get("context", ""),
                        priority=int(q.get("priority", 0)),
                    )
                )

        except (json.JSONDecodeError, ValueError, KeyError):
            # Unparseable plan: fall through to the defaults below.
            pass

    # Sort crop tasks by priority
    crop_tasks.sort(key=lambda t: t["priority"])

    # Fallback: if nothing identified, use first 5 pages
    if not target_pages and not crop_tasks:
        target_pages = list(range(min(num_pages, 5)))

    # Build discussion log message
    crop_summary = f"{len(crop_tasks)} crop tasks on pages {', '.join(str(p + 1) for p in target_pages[:5])}"
    code_summary = f"{len(code_queries)} code queries"
    if code_queries:
        code_summary += f" ({', '.join(q['focus_area'] for q in code_queries[:3])})"

    discussion_msg = AgentMessage(
        timestamp=datetime.now().strftime("%H:%M:%S"),
        agent="planner",
        action="plan",
        summary=f"Planned {crop_summary} and {code_summary}.",
        detail=response_text,
        evidence_refs=[],
    )

    return {
        "target_pages": target_pages,
        "legend_pages": legend_pages,
        "crop_tasks": crop_tasks,
        "code_queries": code_queries,
        "discussion_log": [discussion_msg],
        "status_message": [
            f"Selected {len(target_pages)} pages ({len(legend_pages)} legends), "
            f"planned {len(crop_tasks)} crop tasks, {len(code_queries)} code queries."
        ],
    }
nodes/cropper.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """execute_crops node — Gemini code_execution for agentic cropping (PoC 1 style)."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import logging
6
+ import time
7
+ import uuid
8
+ from collections.abc import Callable
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+
11
+ from google import genai
12
+ from google.genai import types
13
+ from PIL import Image
14
+
15
+ from config import CROPPER_MODEL, GOOGLE_API_KEY
16
+ from prompts.cropper import CROPPER_PROMPT_TEMPLATE
17
+ from state import CropTask, DrawingReaderState, ImageRef
18
+ from tools.crop_cache import CropCache
19
+ from tools.image_store import ImageStore
20
+ from tools.pdf_processor import get_page_image_bytes
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Type alias for the progress callback.
25
+ # Signature: (completed_ref, crop_task, source, completed_count, total_count)
26
+ ProgressCallback = Callable[[ImageRef, CropTask, str, int, int], None]
27
+
28
+ # Retry settings for transient API errors (429 / 503)
29
+ MAX_RETRIES = 3
30
+ RETRY_BASE_DELAY = 2.0 # seconds
31
+
32
+
33
+ def _extract_last_image(response) -> Image.Image | None:
34
+ """Extract the last generated image from a Gemini code_execution response."""
35
+ last_image = None
36
+ for part in response.candidates[0].content.parts:
37
+ # Try as_image() first
38
+ try:
39
+ img_data = part.as_image()
40
+ if img_data is not None:
41
+ last_image = Image.open(io.BytesIO(img_data.image_bytes))
42
+ continue
43
+ except Exception:
44
+ pass
45
+ # Fallback: inline_data
46
+ try:
47
+ if hasattr(part, "inline_data") and part.inline_data is not None:
48
+ img_bytes = part.inline_data.data
49
+ last_image = Image.open(io.BytesIO(img_bytes))
50
+ except Exception:
51
+ pass
52
+ return last_image
53
+
54
+
55
+ def _execute_single_crop_sync(
56
+ client: genai.Client,
57
+ page_image_bytes: bytes,
58
+ crop_task: CropTask,
59
+ image_store: ImageStore,
60
+ ) -> tuple[ImageRef, bool]:
61
+ """Execute one crop via Gemini code_execution (synchronous).
62
+
63
+ Includes retry logic for transient 503/429 errors.
64
+
65
+ Returns
66
+ -------
67
+ (image_ref, is_fallback)
68
+ ``is_fallback`` is True when Gemini failed to produce a crop and the
69
+ full page image was returned instead. Fallbacks should NOT be cached.
70
+ """
71
+ prompt = CROPPER_PROMPT_TEMPLATE.format(
72
+ crop_instruction=crop_task["crop_instruction"],
73
+ )
74
+
75
+ image_part = types.Part.from_bytes(data=page_image_bytes, mime_type="image/png")
76
+
77
+ # Retry loop for transient API errors
78
+ response = None
79
+ for attempt in range(MAX_RETRIES):
80
+ try:
81
+ response = client.models.generate_content(
82
+ model=CROPPER_MODEL,
83
+ contents=[image_part, prompt],
84
+ config=types.GenerateContentConfig(
85
+ tools=[types.Tool(code_execution=types.ToolCodeExecution)]
86
+ ),
87
+ )
88
+ break
89
+ except Exception as e:
90
+ err_str = str(e)
91
+ if ("503" in err_str or "429" in err_str or "UNAVAILABLE" in err_str):
92
+ delay = RETRY_BASE_DELAY * (2 ** attempt)
93
+ logger.warning(
94
+ "Crop API error (attempt %d/%d): %s — retrying in %.1fs",
95
+ attempt + 1, MAX_RETRIES, err_str[:120], delay,
96
+ )
97
+ time.sleep(delay)
98
+ else:
99
+ raise
100
+
101
+ is_fallback = True
102
+ if response is not None:
103
+ final_image = _extract_last_image(response)
104
+ if final_image is not None:
105
+ is_fallback = False
106
+ else:
107
+ final_image = Image.open(io.BytesIO(page_image_bytes))
108
+ else:
109
+ # All retries exhausted
110
+ final_image = Image.open(io.BytesIO(page_image_bytes))
111
+
112
+ crop_id = f"crop_{uuid.uuid4().hex[:6]}"
113
+ ref = image_store.save_crop(
114
+ page_num=crop_task["page_num"],
115
+ crop_id=crop_id,
116
+ image=final_image,
117
+ label=crop_task["label"],
118
+ )
119
+ return ref, is_fallback
120
+
121
+
122
+ def execute_crops(
123
+ state: DrawingReaderState,
124
+ image_store: ImageStore,
125
+ crop_cache: CropCache | None = None,
126
+ progress_callback: ProgressCallback | None = None,
127
+ ) -> dict:
128
+ """Execute all crop tasks concurrently, reusing cached crops when possible.
129
+
130
+ Parameters
131
+ ----------
132
+ progress_callback
133
+ Optional callback invoked on the **main thread** each time a crop
134
+ completes (or is served from cache). Called with
135
+ ``(image_ref, crop_task, source, completed_count, total_count)``
136
+ where *source* is ``"cached"``, ``"completed"``, or ``"fallback"``.
137
+ """
138
+ crop_tasks = state.get("crop_tasks", [])
139
+ page_image_dir = state["page_image_dir"]
140
+
141
+ if not crop_tasks:
142
+ return {"status_message": ["No crop tasks to execute."]}
143
+
144
+ total_count = len(crop_tasks)
145
+ completed_count = 0
146
+
147
+ # ----- Phase 1: Separate cache hits from tasks that need API calls -----
148
+ image_refs: list[ImageRef] = [] # final ordered results
149
+ tasks_to_execute: list[tuple[int, CropTask]] = [] # (original_index, task)
150
+ cache_hits = 0
151
+
152
+ for i, ct in enumerate(crop_tasks):
153
+ if crop_cache is not None:
154
+ cached_ref = crop_cache.lookup(ct["page_num"], ct["crop_instruction"])
155
+ if cached_ref is not None:
156
+ image_refs.append(cached_ref)
157
+ cache_hits += 1
158
+ completed_count += 1
159
+ logger.info(
160
+ "Reusing cached crop for '%s' (page %d)",
161
+ ct["label"], ct["page_num"],
162
+ )
163
+ # Notify the UI immediately for each cache hit
164
+ if progress_callback is not None:
165
+ progress_callback(
166
+ cached_ref, ct, "cached", completed_count, total_count,
167
+ )
168
+ continue
169
+ # Not cached — needs an API call
170
+ tasks_to_execute.append((i, ct))
171
+
172
+ # ----- Phase 2: Execute uncached crops via Gemini -----
173
+ errors: list[str] = []
174
+
175
+ if tasks_to_execute:
176
+ client = genai.Client(api_key=GOOGLE_API_KEY)
177
+
178
+ with ThreadPoolExecutor(max_workers=min(len(tasks_to_execute), 4)) as pool:
179
+ future_to_idx: dict = {}
180
+ for exec_idx, (_, ct) in enumerate(tasks_to_execute):
181
+ page_bytes = get_page_image_bytes(page_image_dir, ct["page_num"])
182
+ future = pool.submit(
183
+ _execute_single_crop_sync, client, page_bytes, ct, image_store,
184
+ )
185
+ future_to_idx[future] = exec_idx
186
+
187
+ # Process results as they arrive — this runs on the MAIN thread,
188
+ # so we can safely invoke the Streamlit progress callback here.
189
+ for future in as_completed(future_to_idx):
190
+ exec_idx = future_to_idx[future]
191
+ orig_idx, ct = tasks_to_execute[exec_idx]
192
+ try:
193
+ ref, is_fallback = future.result()
194
+ image_refs.append(ref)
195
+ completed_count += 1
196
+
197
+ # Register in cache (only successful targeted crops)
198
+ if crop_cache is not None:
199
+ crop_cache.register(
200
+ page_num=ct["page_num"],
201
+ crop_instruction=ct["crop_instruction"],
202
+ label=ct["label"],
203
+ image_ref=ref,
204
+ is_fallback=is_fallback,
205
+ )
206
+
207
+ # Notify the UI as each crop completes
208
+ if progress_callback is not None:
209
+ source = "fallback" if is_fallback else "completed"
210
+ progress_callback(
211
+ ref, ct, source, completed_count, total_count,
212
+ )
213
+
214
+ except Exception as e:
215
+ completed_count += 1
216
+ errors.append(f"Crop task {orig_idx} failed: {e}")
217
+ logger.error("Crop task %d failed: %s", orig_idx, e)
218
+
219
+ # ----- Phase 3: Build status message -----
220
+ api_count = len(tasks_to_execute) - len(errors)
221
+ parts = [f"Completed {len(image_refs)} of {total_count} crops"]
222
+ if cache_hits:
223
+ parts.append(f"({cache_hits} from cache, {api_count} new)")
224
+ if errors:
225
+ parts.append(f"Errors: {'; '.join(errors)}")
226
+ status = ". ".join(parts) + "."
227
+
228
+ if crop_cache is not None:
229
+ logger.info(crop_cache.stats)
230
+
231
+ return {
232
+ "image_refs": image_refs,
233
+ "status_message": [status],
234
+ }
nodes/deliberation.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """deliberation node — GPT peer review of Gemini's compliance analysis."""
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime
5
+
6
+ from openai import OpenAI
7
+
8
+ from config import DELIBERATION_MODEL, OPENAI_API_KEY
9
+ from prompts.deliberation import DELIBERATION_SYSTEM_PROMPT
10
+ from state import AgentMessage, ComplianceState
11
+ from tools.image_store import ImageStore
12
+
13
+
14
+ def deliberation(state: ComplianceState, image_store: ImageStore) -> dict:
15
+ """Send compliance analysis + images + code report to GPT for peer review."""
16
+ question = state["question"]
17
+ compliance_analysis = state.get("compliance_analysis", "")
18
+ code_report = state.get("code_report", "")
19
+ image_refs = state.get("image_refs", [])
20
+
21
+ if not compliance_analysis:
22
+ return {
23
+ "reviewer_analysis": "",
24
+ "discussion_log": [
25
+ AgentMessage(
26
+ timestamp=datetime.now().strftime("%H:%M:%S"),
27
+ agent="reviewer",
28
+ action="review",
29
+ summary="No analysis to review.",
30
+ detail="",
31
+ evidence_refs=[],
32
+ )
33
+ ],
34
+ "status_message": ["No analysis to review."],
35
+ }
36
+
37
+ client = OpenAI(api_key=OPENAI_API_KEY)
38
+
39
+ # Build multimodal message
40
+ user_content: list[dict] = [
41
+ {"type": "text", "text": f"USER COMPLIANCE QUESTION: {question}"},
42
+ {"type": "text", "text": f"\n=== LEGAL REQUIREMENTS ===\n{code_report}"},
43
+ {"type": "text", "text": f"\n=== ANALYST'S COMPLIANCE FINDINGS ===\n{compliance_analysis}"},
44
+ {"type": "text", "text": "\nBELOW ARE THE SAME CROPPED IMAGES THE ANALYST EXAMINED:"},
45
+ ]
46
+
47
+ for ref in image_refs:
48
+ user_content.append(
49
+ {"type": "text", "text": f"\nImage: {ref['label']}"}
50
+ )
51
+ try:
52
+ user_content.append(image_store.to_openai_base64(ref))
53
+ except Exception as e:
54
+ user_content.append(
55
+ {"type": "text", "text": f"(Could not load image: {e})"}
56
+ )
57
+
58
+ user_content.append(
59
+ {"type": "text", "text": "\nPerform your peer review of the compliance determination."}
60
+ )
61
+
62
+ response = client.chat.completions.create(
63
+ model=DELIBERATION_MODEL,
64
+ messages=[
65
+ {"role": "system", "content": DELIBERATION_SYSTEM_PROMPT},
66
+ {"role": "user", "content": user_content},
67
+ ],
68
+ )
69
+
70
+ review_text = response.choices[0].message.content or ""
71
+
72
+ discussion_msg = AgentMessage(
73
+ timestamp=datetime.now().strftime("%H:%M:%S"),
74
+ agent="reviewer",
75
+ action="review",
76
+ summary=f"Peer review complete. {review_text[:100]}...",
77
+ detail=review_text[:1500],
78
+ evidence_refs=[],
79
+ )
80
+
81
+ return {
82
+ "reviewer_analysis": review_text,
83
+ "discussion_log": [discussion_msg],
84
+ "status_message": ["Deliberation/peer review complete."],
85
+ }
nodes/final_verdict.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """final_verdict node — synthesize compliance analysis + optional peer review into structured verdict."""
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime
5
+
6
+ from google import genai
7
+ from google.genai import types
8
+
9
+ from config import GOOGLE_API_KEY, VERDICT_MODEL
10
+ from state import AgentMessage, ComplianceState
11
+
12
+
13
+ def final_verdict(state: ComplianceState) -> dict:
14
+ """Produce the final compliance verdict, synthesizing all evidence."""
15
+ question = state["question"]
16
+ compliance_analysis = state.get("compliance_analysis", "")
17
+ reviewer_analysis = state.get("reviewer_analysis", "")
18
+ code_report = state.get("code_report", "")
19
+ enable_consensus = state.get("enable_consensus", False)
20
+
21
+ # If no consensus was run, pass through the analyst's determination
22
+ if not enable_consensus or not reviewer_analysis:
23
+ verdict_msg = AgentMessage(
24
+ timestamp=datetime.now().strftime("%H:%M:%S"),
25
+ agent="compliance_analyst",
26
+ action="verdict",
27
+ summary="Final compliance verdict issued.",
28
+ detail=compliance_analysis[:1000],
29
+ evidence_refs=[],
30
+ )
31
+ return {
32
+ "final_verdict": compliance_analysis,
33
+ "discussion_log": [verdict_msg],
34
+ "status_message": ["Final verdict ready."],
35
+ }
36
+
37
+ # Synthesize both perspectives
38
+ client = genai.Client(api_key=GOOGLE_API_KEY)
39
+
40
+ synthesis_prompt = f"""\
41
+ You are producing a FINAL COMPLIANCE VERDICT for a NYC building code review.
42
+
43
+ USER QUESTION: {question}
44
+
45
+ CODE REQUIREMENTS (from legal research):
46
+ {code_report[:3000]}
47
+
48
+ ANALYST A (Gemini) compliance findings:
49
+ {compliance_analysis}
50
+
51
+ ANALYST B (GPT) peer review:
52
+ {reviewer_analysis}
53
+
54
+ YOUR TASK:
55
+ 1. If both analysts AGREE on compliance status: produce a confident, unified verdict.
56
+ 2. If they PARTIALLY AGREE: produce the verdict based on agreed points, and explicitly \
57
+ note areas of disagreement with evidence from both sides.
58
+ 3. If they DISAGREE: present both interpretations, explain the discrepancy, and state \
59
+ which determination appears better supported by the evidence.
60
+
61
+ OUTPUT FORMAT:
62
+
63
+ ### Compliance Verdict
64
+ **Status:** Compliant | Non-Compliant | Partially Compliant | Unverifiable
65
+
66
+ ### Legal Basis
67
+ For each code requirement checked:
68
+ - **[Code Type] SS[Section] — [Title]**
69
+ - Requirement: [specific measurable requirement]
70
+ - Drawing Evidence: [what was observed]
71
+ - Determination: [compliant/non-compliant/unverifiable]
72
+
73
+ ### Key Findings
74
+ - Bullet points of the most important compliance determinations
75
+
76
+ ### Analyst Consensus
77
+ - Agreement/disagreement between Gemini and GPT analysts
78
+ - Resolution of any conflicts
79
+
80
+ ### Limitations
81
+ - What could not be verified and why
82
+ - Recommended follow-up actions
83
+
84
+ Always cite BOTH code sections AND image crop labels for every factual claim.
85
+ """
86
+
87
+ response = client.models.generate_content(
88
+ model=VERDICT_MODEL,
89
+ contents=[synthesis_prompt],
90
+ )
91
+
92
+ verdict_text = response.text
93
+
94
+ verdict_msg = AgentMessage(
95
+ timestamp=datetime.now().strftime("%H:%M:%S"),
96
+ agent="compliance_analyst",
97
+ action="verdict",
98
+ summary="Final synthesized compliance verdict issued.",
99
+ detail=verdict_text[:1000],
100
+ evidence_refs=[],
101
+ )
102
+
103
+ return {
104
+ "final_verdict": verdict_text,
105
+ "discussion_log": [verdict_msg],
106
+ "status_message": ["Final synthesized verdict ready."],
107
+ }
nodes/metadata_generator.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Background page metadata generator — extracts per-page descriptions from the full PDF.
2
+
3
+ Uses parallel batch processing: the PDF is split into 5-page chunks and each
4
+ chunk is sent to Gemini concurrently for faster metadata extraction.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ import math
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+
13
+ from google import genai
14
+ from google.genai import types
15
+
16
+ from config import GOOGLE_API_KEY, METADATA_MODEL
17
+ from prompts.metadata import METADATA_SYSTEM_PROMPT
18
+ from tools.pdf_processor import extract_page_range_bytes
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Number of PDF pages per batch sent to Gemini in parallel.
23
+ BATCH_SIZE = 5
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # JSON extraction helper
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def _extract_json_array(response_text: str) -> list[dict]:
31
+ """Extract the outermost balanced JSON array from a response string."""
32
+ start = response_text.find("[")
33
+ if start == -1:
34
+ raise ValueError("No JSON array found in metadata generation response")
35
+
36
+ depth = 0
37
+ end = None
38
+ for i in range(start, len(response_text)):
39
+ if response_text[i] == "[":
40
+ depth += 1
41
+ elif response_text[i] == "]":
42
+ depth -= 1
43
+ if depth == 0:
44
+ end = i
45
+ break
46
+
47
+ if end is None:
48
+ raise ValueError("No matching closing bracket found in metadata response")
49
+
50
+ result = json.loads(response_text[start : end + 1])
51
+ if not isinstance(result, list):
52
+ raise ValueError(f"Expected list, got {type(result)}")
53
+ return result
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Single-batch API call
58
+ # ---------------------------------------------------------------------------
59
+
60
def _generate_batch(
    pdf_path: str,
    page_start_0: int,
    page_end_0: int,
    page_start_1: int,
    page_end_1: int,
) -> list[dict]:
    """Run one Gemini call covering a contiguous slice of the drawing set.

    Args:
        pdf_path: Location of the complete PDF on disk.
        page_start_0: First page of the slice (0-indexed, inclusive), used
            when extracting PDF bytes.
        page_end_0: Last page of the slice (0-indexed, inclusive).
        page_start_1: Same first page expressed 1-indexed, embedded in the
            prompt so the model labels pages consistently.
        page_end_1: Same last page expressed 1-indexed.

    Returns:
        The parsed metadata entries (one dict per page) for this slice.
    """
    page_count = page_end_1 - page_start_1 + 1

    # Attach the sliced PDF as an inline binary part.
    excerpt = extract_page_range_bytes(pdf_path, page_start_0, page_end_0)
    excerpt_part = types.Part.from_bytes(data=excerpt, mime_type="application/pdf")

    prompt = (
        f"This PDF excerpt contains {page_count} page(s), "
        f"corresponding to pages {page_start_1} through {page_end_1} of the full drawing set.\n"
        f"Generate structured metadata for ALL {page_count} page(s). "
        f"Use page numbers {page_start_1} through {page_end_1} (1-indexed). "
        f"Return a JSON array with exactly {page_count} objects."
    )
    prompt_part = types.Part.from_text(text=prompt)

    gemini = genai.Client(api_key=GOOGLE_API_KEY)
    reply = gemini.models.generate_content(
        model=METADATA_MODEL,
        contents=[types.Content(role="user", parts=[excerpt_part, prompt_part])],
        config=types.GenerateContentConfig(
            system_instruction=METADATA_SYSTEM_PROMPT,
        ),
    )

    return _extract_json_array(reply.text.strip())
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Public entry point
107
+ # ---------------------------------------------------------------------------
108
+
109
def generate_page_metadata(
    pdf_path: str,
    num_pages: int,
    progress_callback=None,
) -> list[dict]:
    """Extract per-page structured metadata from a PDF using parallel batches.

    The PDF is split into chunks of ``BATCH_SIZE`` pages. Each chunk is sent to
    Gemini concurrently via a thread pool. Results are merged, any missing
    pages are back-filled, and the list is returned sorted by page number.

    Args:
        pdf_path: Path to the full PDF.
        num_pages: Total number of pages. A value of zero (or less) yields
            an empty result without touching the PDF or the API.
        progress_callback: Optional ``(completed_batches, total_batches, page_range_str) -> None``
            called after each batch finishes (including failed batches).

    Returns:
        A list of dicts (1-indexed ``page_num``), one per page, sorted by
        page number.

    Raises:
        RuntimeError: If every batch fails (caller handles the error).
    """
    # Guard: ThreadPoolExecutor rejects max_workers=0, so an empty document
    # must short-circuit before the pool is created.
    if num_pages <= 0:
        return []

    num_batches = math.ceil(num_pages / BATCH_SIZE)
    logger.info(
        "Starting parallel metadata generation: %d pages in %d batches of %d",
        num_pages, num_batches, BATCH_SIZE,
    )

    all_results: list[dict] = []
    errors: list[str] = []
    completed_count = 0

    # Cap concurrency so a very large PDF does not spawn one thread per batch.
    with ThreadPoolExecutor(max_workers=min(num_batches, 8)) as executor:
        futures = {}
        for batch_idx in range(num_batches):
            page_start_0 = batch_idx * BATCH_SIZE
            page_end_0 = min(page_start_0 + BATCH_SIZE - 1, num_pages - 1)
            # 1-indexed equivalents are embedded in the model prompt.
            page_start_1 = page_start_0 + 1
            page_end_1 = page_end_0 + 1

            future = executor.submit(
                _generate_batch,
                pdf_path,
                page_start_0,
                page_end_0,
                page_start_1,
                page_end_1,
            )
            futures[future] = (page_start_1, page_end_1)

        for future in as_completed(futures):
            batch_range = futures[future]
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                completed_count += 1
                logger.info("Batch pages %d-%d complete: %d entries", batch_range[0], batch_range[1], len(batch_results))
                if progress_callback is not None:
                    progress_callback(
                        completed_count,
                        num_batches,
                        f"Pages {batch_range[0]}-{batch_range[1]}",
                    )
            except Exception as e:
                # A failed batch is recorded but does not abort the others;
                # its pages are back-filled with placeholder entries below.
                completed_count += 1
                errors.append(f"Batch pages {batch_range[0]}-{batch_range[1]} failed: {e}")
                logger.exception("Batch pages %d-%d failed", batch_range[0], batch_range[1])
                if progress_callback is not None:
                    progress_callback(
                        completed_count,
                        num_batches,
                        f"Pages {batch_range[0]}-{batch_range[1]} (failed)",
                    )

    if errors and not all_results:
        raise RuntimeError(
            "All metadata batches failed:\n" + "\n".join(errors)
        )

    if errors:
        logger.warning("Some batches failed (results will have gaps): %s", errors)

    # Metadata stays 1-indexed (as the model produced it) because it will be
    # passed as context text to the planner model, which also uses 1-indexed.
    # The planner's *output* is converted to 0-indexed in nodes/planner.py.

    # Fill in any missing pages with minimal entries (1-indexed).
    covered_pages = {item.get("page_num") for item in all_results}
    for p in range(1, num_pages + 1):
        if p not in covered_pages:
            all_results.append({
                "page_num": p,
                "sheet_id": "unknown",
                "sheet_title": "Unknown",
                "discipline": "other",
                "page_type": "other",
                "description": "Metadata not extracted for this page.",
                "key_elements": [],
                "spatial_coverage": "",
            })

    # Sort by page number.
    all_results.sort(key=lambda x: x.get("page_num", 0))

    return all_results
prompts/__init__.py ADDED
File without changes
prompts/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (232 Bytes). View file
 
prompts/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (1.1 kB). View file
 
prompts/__pycache__/code_lookup.cpython-313.pyc ADDED
Binary file (1.95 kB). View file
 
prompts/__pycache__/compliance_analyst.cpython-313.pyc ADDED
Binary file (4.29 kB). View file
 
prompts/__pycache__/compliance_planner.cpython-313.pyc ADDED
Binary file (3.2 kB). View file
 
prompts/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (1.18 kB). View file
 
prompts/__pycache__/deliberation.cpython-313.pyc ADDED
Binary file (1.71 kB). View file