Ryan2219 committed on
Commit
e1ced8e
·
verified ·
1 Parent(s): f6b94ac

Upload 70 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. NorthMaconPark.pdf +3 -0
  3. __init__.py +0 -0
  4. app.py +628 -0
  5. config.py +40 -0
  6. data/BUILDING_CODE.json +0 -0
  7. data/FUEL_GAS_CODE.json +0 -0
  8. data/GENERAL_ADMINISTRATIVE_PROVISIONS.json +3 -0
  9. data/MECHANICAL_CODE.json +0 -0
  10. data/PLUMBING_CODE.json +0 -0
  11. data/ingest_chromadb.py +120 -0
  12. data/nyc_code_db/chroma.sqlite3 +3 -0
  13. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/data_level0.bin +3 -0
  14. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/header.bin +3 -0
  15. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/index_metadata.pickle +3 -0
  16. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/length.bin +3 -0
  17. data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/link_lists.bin +3 -0
  18. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/data_level0.bin +3 -0
  19. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/header.bin +3 -0
  20. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/index_metadata.pickle +3 -0
  21. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/length.bin +3 -0
  22. data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/link_lists.bin +3 -0
  23. data/preprocess_codes.py +253 -0
  24. graph.py +147 -0
  25. nodes/__init__.py +0 -0
  26. nodes/__pycache__/__init__.cpython-313.pyc +0 -0
  27. nodes/__pycache__/annotator.cpython-313.pyc +0 -0
  28. nodes/__pycache__/code_lookup.cpython-313.pyc +0 -0
  29. nodes/__pycache__/compliance_analyst.cpython-313.pyc +0 -0
  30. nodes/__pycache__/compliance_planner.cpython-313.pyc +0 -0
  31. nodes/__pycache__/cropper.cpython-313.pyc +0 -0
  32. nodes/__pycache__/deliberation.cpython-313.pyc +0 -0
  33. nodes/__pycache__/final_verdict.cpython-313.pyc +0 -0
  34. nodes/__pycache__/metadata_generator.cpython-313.pyc +0 -0
  35. nodes/annotator.py +117 -0
  36. nodes/code_lookup.py +286 -0
  37. nodes/compliance_analyst.py +188 -0
  38. nodes/compliance_planner.py +130 -0
  39. nodes/cropper.py +234 -0
  40. nodes/deliberation.py +85 -0
  41. nodes/final_verdict.py +107 -0
  42. nodes/metadata_generator.py +211 -0
  43. prompts/__init__.py +0 -0
  44. prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  45. prompts/__pycache__/annotator.cpython-313.pyc +0 -0
  46. prompts/__pycache__/code_lookup.cpython-313.pyc +0 -0
  47. prompts/__pycache__/compliance_analyst.cpython-313.pyc +0 -0
  48. prompts/__pycache__/compliance_planner.cpython-313.pyc +0 -0
  49. prompts/__pycache__/cropper.cpython-313.pyc +0 -0
  50. prompts/__pycache__/deliberation.cpython-313.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/GENERAL_ADMINISTRATIVE_PROVISIONS.json filter=lfs diff=lfs merge=lfs -text
37
+ data/nyc_code_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
38
+ NorthMaconPark.pdf filter=lfs diff=lfs merge=lfs -text
NorthMaconPark.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aed76b73fbe205e1579e3a00be6e95b7564e72594b1fdb83311819d447f8fc4
3
+ size 39114794
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit UI for NYC Code Compliance Bot — with agent discussion panel."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import logging
6
+ import tempfile
7
+ from pathlib import Path
8
+ import os
9
+ import sys
10
+ import streamlit as st
11
+ from PIL import Image
12
+
13
+ from config import MAX_INVESTIGATION_ROUNDS
14
+ from graph import compile_compliance_graph
15
+ from tools.chroma_tools import warmup_collection, is_warmed_up
16
+ from tools.crop_cache import CropCache
17
+ from tools.image_store import ImageStore
18
+ from tools.metadata_cache import MetadataState, get_cached_metadata
19
+ from tools.pdf_processor import render_pages
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Page config
25
+ # ---------------------------------------------------------------------------
26
+ st.set_page_config(
27
+ page_title="NYC Code Compliance Bot",
28
+ page_icon=":building_construction:",
29
+ layout="wide",
30
+ )
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Custom CSS for agent discussion panel
34
+ # ---------------------------------------------------------------------------
35
+ st.markdown("""
36
+ <style>
37
+ .agent-msg {
38
+ padding: 8px 12px;
39
+ margin: 4px 0;
40
+ border-radius: 8px;
41
+ font-size: 0.9em;
42
+ }
43
+ .agent-planner {
44
+ background-color: #e3f2fd;
45
+ border-left: 4px solid #1565c0;
46
+ }
47
+ .agent-code_analyst {
48
+ background-color: #fff3e0;
49
+ border-left: 4px solid #e65100;
50
+ }
51
+ .agent-compliance_analyst {
52
+ background-color: #e8f5e9;
53
+ border-left: 4px solid #2e7d32;
54
+ }
55
+ .agent-reviewer {
56
+ background-color: #f3e5f5;
57
+ border-left: 4px solid #6a1b9a;
58
+ }
59
+ .agent-icon {
60
+ font-weight: bold;
61
+ margin-right: 6px;
62
+ }
63
+ .agent-timestamp {
64
+ color: #666;
65
+ font-size: 0.8em;
66
+ }
67
+ </style>
68
+ """, unsafe_allow_html=True)
69
+
70
+ AGENT_ICONS = {
71
+ "planner": "\U0001f4cb",
72
+ "code_analyst": "\u2696\ufe0f",
73
+ "compliance_analyst": "\U0001f50d",
74
+ "reviewer": "\U0001f91d",
75
+ }
76
+
77
+ AGENT_LABELS = {
78
+ "planner": "Planner",
79
+ "code_analyst": "Code Analyst",
80
+ "compliance_analyst": "Compliance Analyst",
81
+ "reviewer": "Reviewer",
82
+ }
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Session state defaults
86
+ # ---------------------------------------------------------------------------
87
+ if "pdf_loaded" not in st.session_state:
88
+ st.session_state.pdf_loaded = False
89
+ if "chat_history" not in st.session_state:
90
+ st.session_state.chat_history = []
91
+ if "image_store" not in st.session_state:
92
+ st.session_state.image_store = None
93
+ if "ingest_state" not in st.session_state:
94
+ st.session_state.ingest_state = {}
95
+ if "pdf_bytes" not in st.session_state:
96
+ st.session_state.pdf_bytes = None
97
+ if "metadata_state" not in st.session_state:
98
+ st.session_state.metadata_state = MetadataState()
99
+ if "crop_cache" not in st.session_state:
100
+ st.session_state.crop_cache = CropCache()
101
+ if "discussion_log" not in st.session_state:
102
+ st.session_state.discussion_log = []
103
+ if "code_report" not in st.session_state:
104
+ st.session_state.code_report = ""
105
+ if "code_sections" not in st.session_state:
106
+ st.session_state.code_sections = []
107
+ if "image_refs" not in st.session_state:
108
+ st.session_state.image_refs = []
109
+ if "db_ready" not in st.session_state:
110
+ st.session_state.db_ready = False
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Startup: warm up embedding model + ChromaDB
114
+ # ---------------------------------------------------------------------------
115
# ---------------------------------------------------------------------------
# Startup: warm up embedding model + ChromaDB
# ---------------------------------------------------------------------------
# Runs once per session. On failure the app keeps working, but db_ready stays
# False so code lookup is disabled downstream.
if not st.session_state.db_ready:
    with st.status("Loading NYC Code Database...", expanded=True) as _db_status:
        st.write(":brain: Loading embedding model (bge-large-en-v1.5)...")
        st.write("_This is a one-time download (~1.3 GB) on first run._")
        if warmup_collection():
            st.session_state.db_ready = True
            _db_status.update(label="NYC Code Database ready", state="complete")
        else:
            st.session_state.db_ready = False
            _db_status.update(
                label="NYC Code Database not available — code lookup will be disabled",
                state="error",
            )
    # Rerun only on success so the status widget is cleared from the page.
    if st.session_state.db_ready:
        st.rerun()
131
+
132
+ # ---------------------------------------------------------------------------
133
+ # Sidebar
134
+ # ---------------------------------------------------------------------------
135
+ with st.sidebar:
136
+ st.title(":building_construction: NYC Code Compliance Bot")
137
+ st.markdown(
138
+ "Upload a construction drawing PDF and ask compliance questions. "
139
+ "The system uses **agentic vision** + **NYC code database** to "
140
+ "verify code compliance."
141
+ )
142
+
143
+ st.divider()
144
+
145
+ # PDF upload
146
+ uploaded_file = st.file_uploader("Upload Drawing PDF", type=["pdf"])
147
+
148
+ # Default drawing button
149
+ _DEFAULT_PDF = Path(__file__).parent / "NorthMaconPark.pdf"
150
+ if _DEFAULT_PDF.exists() and not st.session_state.pdf_loaded:
151
+ st.markdown("**— or —**")
152
+ if st.button("Use Default Drawing", use_container_width=True):
153
+ st.session_state._use_default_pdf = True
154
+ st.rerun()
155
+
156
+ st.divider()
157
+
158
+ # Settings
159
+ st.subheader("Settings")
160
+ enable_consensus = st.checkbox(
161
+ "Enable peer review (Gemini + GPT)",
162
+ value=False,
163
+ help="GPT reviews Gemini's compliance analysis. Slower but more thorough.",
164
+ )
165
+ enable_annotation = st.checkbox(
166
+ "Enable annotation",
167
+ value=False,
168
+ help="Annotate crops with numbered highlights before analysis.",
169
+ )
170
+ max_rounds = st.slider(
171
+ "Max investigation rounds",
172
+ min_value=1,
173
+ max_value=5,
174
+ value=MAX_INVESTIGATION_ROUNDS,
175
+ help="Maximum crop-analyze loops before forcing a final verdict.",
176
+ )
177
+
178
+ st.divider()
179
+ st.caption("Powered by LangGraph + Gemini + GPT + ChromaDB")
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # PDF ingestion — Phase A: render pages
183
+ # ---------------------------------------------------------------------------
184
+ # Determine if we have a PDF to process (uploaded or default)
185
+ _pending_pdf: tuple[str, bytes] | None = None
186
+ if not st.session_state.pdf_loaded:
187
+ if uploaded_file is not None:
188
+ _pending_pdf = (uploaded_file.name, uploaded_file.getvalue())
189
+ elif st.session_state.get("_use_default_pdf"):
190
+ _default = Path(__file__).parent / "NorthMaconPark.pdf"
191
+ if _default.exists():
192
+ _pending_pdf = (_default.name, _default.read_bytes())
193
+
194
+ if _pending_pdf is not None and not st.session_state.pdf_loaded:
195
+ pdf_name, pdf_bytes = _pending_pdf
196
+ with st.status("Converting PDF to images...", expanded=True) as status:
197
+ tmp_dir = tempfile.mkdtemp(prefix="compliance_bot_")
198
+ pdf_path = Path(tmp_dir) / pdf_name
199
+ pdf_path.write_bytes(pdf_bytes)
200
+
201
+ st.session_state.pdf_bytes = pdf_bytes
202
+ st.session_state.crop_cache = CropCache()
203
+
204
+ image_store = ImageStore(str(Path(tmp_dir) / "images"))
205
+ st.session_state.image_store = image_store
206
+ page_image_dir = str(image_store._pages_dir)
207
+
208
+ # Check for cached metadata
209
+ cached = get_cached_metadata(pdf_bytes)
210
+ if cached is not None:
211
+ st.session_state.metadata_state.set_ready(json.dumps(cached, indent=2))
212
+ st.write("Page index loaded from cache")
213
+
214
+ st.write("Rendering pages...")
215
+ num_pages = render_pages(str(pdf_path), page_image_dir)
216
+
217
+ st.session_state.ingest_state = {
218
+ "pdf_path": str(pdf_path),
219
+ "page_image_dir": page_image_dir,
220
+ "num_pages": num_pages,
221
+ }
222
+ st.session_state.pdf_loaded = True
223
+ st.session_state.pop("_use_default_pdf", None)
224
+ st.write(f"Converted {num_pages} pages to images.")
225
+ status.update(label=f"PDF ready: {num_pages} pages", state="complete")
226
+ st.rerun()
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # PDF ingestion — Phase B: generate page index
230
+ # ---------------------------------------------------------------------------
231
+ if st.session_state.pdf_loaded:
232
+ meta = st.session_state.metadata_state
233
+ if meta.status == "not_started":
234
+ if st.session_state.pdf_bytes is not None:
235
+ with st.expander(":page_facing_up: PDF Viewer", expanded=False):
236
+ st.pdf(st.session_state.pdf_bytes, height=400)
237
+
238
+ ingest = st.session_state.ingest_state
239
+ num_pages = ingest["num_pages"]
240
+
241
+ st.write("**Generating page index...**")
242
+ progress_bar = st.progress(0, text="Analyzing pages to build searchable index...")
243
+
244
+ def _index_progress(completed: int, total: int, label: str):
245
+ pct = completed / total
246
+ progress_bar.progress(pct, text=f"Indexing: {label} ({completed}/{total} batches)")
247
+
248
+ meta.generate_sync(
249
+ ingest["pdf_path"],
250
+ num_pages,
251
+ st.session_state.pdf_bytes,
252
+ progress_callback=_index_progress,
253
+ )
254
+ if meta.is_ready:
255
+ progress_bar.progress(1.0, text="Page index ready!")
256
+ else:
257
+ progress_bar.progress(1.0, text="Indexing failed — using full PDF mode")
258
+ st.rerun()
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # Main layout (pre-upload welcome)
262
+ # ---------------------------------------------------------------------------
263
+ if not st.session_state.pdf_loaded:
264
+ _left, center, _right = st.columns([1, 2, 1])
265
+ with center:
266
+ st.markdown(
267
+ "<h1 style='text-align: center;'>:building_construction: NYC Code Compliance Bot</h1>",
268
+ unsafe_allow_html=True,
269
+ )
270
+ st.markdown(
271
+ "<p style='text-align: center; color: grey;'>"
272
+ "Upload a construction drawing PDF in the sidebar to get started.<br>"
273
+ "This tool uses <b>agentic vision</b> and the <b>NYC Building Code database</b> "
274
+ "to verify code compliance in your drawings."
275
+ "</p>",
276
+ unsafe_allow_html=True,
277
+ )
278
+ st.stop()
279
+
280
+ # ---------------------------------------------------------------------------
281
+ # PDF viewer
282
+ # ---------------------------------------------------------------------------
283
+ if st.session_state.pdf_bytes is not None:
284
+ with st.expander(":page_facing_up: PDF Viewer", expanded=False):
285
+ st.pdf(st.session_state.pdf_bytes, height=400)
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Three-column layout: chat | discussion | images+code
289
+ # ---------------------------------------------------------------------------
290
+ chat_col, discuss_col, evidence_col = st.columns([2, 2, 2])
291
+
292
+ # ---------------------------------------------------------------------------
293
+ # Discussion panel (agent conversation)
294
+ # ---------------------------------------------------------------------------
295
def render_discussion_log(container, discussion_log: list[dict]):
    """Render each agent message of *discussion_log* into *container*.

    Messages are styled via the ``.agent-<name>`` CSS classes injected at
    page top; an unrecognised agent falls back to a generic robot icon and
    its raw id as the label.
    """
    with container:
        for entry in discussion_log:
            agent_id = entry.get("agent", "unknown")
            icon = AGENT_ICONS.get(agent_id, "\U0001f916")
            label = AGENT_LABELS.get(agent_id, agent_id)
            timestamp = entry.get("timestamp", "")
            summary = entry.get("summary", "")
            # Assemble the HTML fragment first, then emit it in one call.
            html = (
                f'<div class="agent-msg agent-{agent_id}">'
                f'<span class="agent-timestamp">[{timestamp}]</span> '
                f'<span class="agent-icon">{icon} {label}</span><br>'
                f'{summary}'
                f'</div>'
            )
            st.markdown(html, unsafe_allow_html=True)
312
+
313
+ # ---------------------------------------------------------------------------
314
+ # Chat history display
315
+ # ---------------------------------------------------------------------------
316
+ with chat_col:
317
+ st.subheader(":speech_balloon: Chat")
318
+
319
+ meta = st.session_state.metadata_state
320
+ if meta.is_ready:
321
+ st.caption("Page index ready — fast planning enabled")
322
+ elif meta.status == "failed":
323
+ st.caption("Page indexing failed — using full PDF mode")
324
+
325
+ for role, content, _refs in st.session_state.chat_history:
326
+ with st.chat_message(role):
327
+ st.markdown(content)
328
+
329
+ question = st.chat_input("Ask a compliance question about the drawing...")
330
+
331
+ # ---------------------------------------------------------------------------
332
+ # Discussion panel
333
+ # ---------------------------------------------------------------------------
334
+ with discuss_col:
335
+ st.subheader(":busts_in_silhouette: Agent Discussion")
336
+ discussion_container = st.container()
337
+
338
+ if st.session_state.discussion_log:
339
+ render_discussion_log(discussion_container, st.session_state.discussion_log)
340
+ else:
341
+ st.info("Agent discussions will appear here during analysis.")
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # Evidence panel (images + code)
345
+ # ---------------------------------------------------------------------------
346
+ with evidence_col:
347
+ st.subheader(":framed_picture: Evidence")
348
+
349
+ evidence_tabs = st.tabs(["Drawing Crops", "Code Sections"])
350
+
351
+ with evidence_tabs[0]:
352
+ if st.session_state.image_refs:
353
+ for ref in st.session_state.image_refs:
354
+ try:
355
+ img = Image.open(ref["path"])
356
+ st.image(img, caption=ref["label"], use_container_width=True)
357
+ except Exception:
358
+ st.warning(f"Could not load: {ref['label']}")
359
+ elif st.session_state.chat_history:
360
+ st.info("No images for this question.")
361
+ else:
362
+ st.info("Ask a question to see drawing crops here.")
363
+
364
+ with evidence_tabs[1]:
365
+ if st.session_state.code_sections:
366
+ for sec in st.session_state.code_sections:
367
+ with st.expander(
368
+ f":balance_scale: {sec.get('code_type', '?')} §{sec.get('section_full', '?')}",
369
+ expanded=False,
370
+ ):
371
+ if sec.get("relevance"):
372
+ st.caption(sec["relevance"])
373
+ st.markdown(sec.get("text", "")[:1500])
374
+ if st.session_state.code_report:
375
+ with st.expander(":page_facing_up: Full Code Report", expanded=False):
376
+ st.markdown(st.session_state.code_report[:5000])
377
+ else:
378
+ st.info("Code sections retrieved during analysis will appear here.")
379
+
380
+
381
+ # ---------------------------------------------------------------------------
382
+ # Question processing
383
+ # ---------------------------------------------------------------------------
384
+ if question:
385
+ # Add user message to history
386
+ st.session_state.chat_history.append(("user", question, []))
387
+ st.session_state.discussion_log = [] # Reset discussion for new question
388
+ st.session_state.code_report = "" # Reset code report for new question
389
+ st.session_state.code_sections = [] # Reset code sections for new question
390
+ st.session_state.image_refs = [] # Reset image refs for new question
391
+
392
+ with chat_col:
393
+ with st.chat_message("user"):
394
+ st.markdown(question)
395
+
396
+ # Build initial state
397
+ ingest = st.session_state.ingest_state
398
+ image_store = st.session_state.image_store
399
+
400
+ meta = st.session_state.metadata_state
401
+ metadata_json = meta.data_json if meta.is_ready else ""
402
+
403
+ question_state = {
404
+ "messages": [],
405
+ "question": question,
406
+ "pdf_path": ingest.get("pdf_path", ""),
407
+ "page_image_dir": ingest.get("page_image_dir", ""),
408
+ "num_pages": ingest.get("num_pages", 0),
409
+ "page_metadata_json": metadata_json,
410
+ "legend_pages": [],
411
+ "target_pages": [],
412
+ "crop_tasks": [],
413
+ "code_queries": [],
414
+ "image_refs": [],
415
+ "code_sections": [],
416
+ "code_report": "",
417
+ "code_chapters_fetched": [],
418
+ "compliance_analysis": "",
419
+ "reviewer_analysis": "",
420
+ "final_verdict": "",
421
+ "discussion_log": [],
422
+ "additional_crop_tasks": [],
423
+ "additional_code_queries": [],
424
+ "needs_more_investigation": False,
425
+ "investigation_round": 0,
426
+ "max_rounds": max_rounds,
427
+ "enable_consensus": enable_consensus,
428
+ "enable_annotation": enable_annotation,
429
+ "status_message": [],
430
+ }
431
+
432
+ # ------------------------------------------------------------------
433
+ # Live progress
434
+ # ------------------------------------------------------------------
435
+ crop_cache = st.session_state.crop_cache
436
+
437
+ with evidence_col:
438
+ with evidence_tabs[0]:
439
+ crop_counter_placeholder = st.empty()
440
+ crop_image_container = st.container()
441
+
442
+ def on_crop_progress(
443
+ completed_ref, crop_task, source: str, completed_count: int, total_count: int,
444
+ ) -> None:
445
+ source_tag = " (cached)" if source == "cached" else ""
446
+ crop_counter_placeholder.markdown(
447
+ f"**Crop {completed_count}/{total_count}**{source_tag} \n"
448
+ f"Latest: *{crop_task.get('label', 'Crop')}*"
449
+ )
450
+ with crop_image_container:
451
+ try:
452
+ img = Image.open(completed_ref["path"])
453
+ caption = completed_ref["label"]
454
+ if source == "cached":
455
+ caption += " (cached)"
456
+ st.image(img, caption=caption, use_container_width=True)
457
+ except Exception:
458
+ st.warning(f"Could not load: {completed_ref['label']}")
459
+
460
+ # Compile graph
461
+ compliance_graph = compile_compliance_graph(image_store, crop_cache, on_crop_progress)
462
+
463
+ # Node progress labels
464
+ PROGRESS_LABELS = {
465
+ "compliance_planner": "Planning investigation...",
466
+ "execute_crops": "Cropping drawing images...",
467
+ "annotate_crops": "Annotating crops...",
468
+ "initial_code_lookup": "Searching NYC code database...",
469
+ "compliance_analyst": "Analyzing compliance...",
470
+ "targeted_code_lookup": "Follow-up code search...",
471
+ "deliberation": "Running peer review...",
472
+ "final_verdict": "Synthesizing verdict...",
473
+ }
474
+
475
+ with chat_col:
476
+ with st.status("Investigating compliance...", expanded=True) as status:
477
+ all_image_refs: list[dict] = []
478
+ all_discussion: list[dict] = []
479
+ final_verdict_text = ""
480
+ code_report_text = ""
481
+
482
+ st.write(PROGRESS_LABELS["compliance_planner"])
483
+
484
+ # Placeholder for parallel-branch status (updated after planner completes)
485
+ parallel_status = st.empty()
486
+
487
+ for event in compliance_graph.stream(question_state, stream_mode="updates"):
488
+ node_name = list(event.keys())[0]
489
+ update = event[node_name]
490
+
491
+ # Status messages (list, since parallel nodes can both emit)
492
+ status_msgs = update.get("status_message", [])
493
+ for status_msg in status_msgs:
494
+ if status_msg:
495
+ st.write(f":white_check_mark: {status_msg}")
496
+
497
+ # Collect discussion messages
498
+ new_discussion = update.get("discussion_log", [])
499
+ if new_discussion:
500
+ all_discussion.extend(new_discussion)
501
+ st.session_state.discussion_log = all_discussion
502
+ # Re-render discussion panel
503
+ render_discussion_log(discussion_container, all_discussion)
504
+
505
+ # Node-specific handling
506
+ if node_name == "compliance_planner":
507
+ target_pages = update.get("target_pages", [])
508
+ crop_tasks = update.get("crop_tasks", [])
509
+ code_queries = update.get("code_queries", [])
510
+
511
+ with st.expander(":clipboard: Investigation Plan", expanded=True):
512
+ if target_pages:
513
+ st.markdown(f"**Target pages:** {', '.join(str(p + 1) for p in target_pages)}")
514
+ if crop_tasks:
515
+ st.markdown(f"**Image crops ({len(crop_tasks)}):**")
516
+ for i, task in enumerate(crop_tasks, 1):
517
+ display_page = task.get("page_num", 0) + 1
518
+ st.markdown(f" {i}. {task.get('label', 'Crop')} (p.{display_page})")
519
+ if code_queries:
520
+ st.markdown(f"**Code queries ({len(code_queries)}):**")
521
+ for i, q in enumerate(code_queries, 1):
522
+ st.markdown(f" {i}. [{q.get('focus_area', '?')}] {q.get('query', '')[:80]}...")
523
+
524
+ if crop_tasks:
525
+ crop_counter_placeholder.markdown(f"**Crop 0/{len(crop_tasks)}** — starting...")
526
+
527
+ # Show parallel execution message (this appears while both branches run)
528
+ parallel_status.info(
529
+ ":arrows_counterclockwise: Running in parallel: "
530
+ f"**Cropping {len(crop_tasks)} images** + "
531
+ f"**Searching {len(code_queries)} code queries**. "
532
+ "This may take 30-60 seconds..."
533
+ )
534
+
535
+ elif node_name in ("initial_code_lookup", "execute_crops"):
536
+ # Clear the parallel status once a branch finishes
537
+ parallel_status.empty()
538
+
539
+ if node_name in ("initial_code_lookup", "targeted_code_lookup"):
540
+ report = update.get("code_report", "")
541
+ new_sections = update.get("code_sections", [])
542
+ if report:
543
+ code_report_text = report
544
+ st.session_state.code_report = report
545
+ if new_sections:
546
+ st.session_state.code_sections.extend(new_sections)
547
+ # Render each new section in the evidence panel in real-time
548
+ with evidence_col:
549
+ with evidence_tabs[1]:
550
+ for sec in new_sections:
551
+ with st.expander(
552
+ f":balance_scale: {sec.get('code_type', '?')} "
553
+ f"§{sec.get('section_full', '?')}",
554
+ expanded=False,
555
+ ):
556
+ if sec.get("relevance"):
557
+ st.caption(sec["relevance"])
558
+ st.markdown(sec.get("text", "")[:1500])
559
+
560
+ elif node_name == "compliance_analyst":
561
+ analysis = update.get("compliance_analysis", "")
562
+ needs_more = update.get("needs_more_investigation", False)
563
+ round_num = update.get("investigation_round", 1)
564
+
565
+ if analysis:
566
+ label = f":mag: Compliance Analysis (Round {round_num})"
567
+ if needs_more:
568
+ label += " — requesting more evidence"
569
+ with st.expander(label, expanded=False):
570
+ st.markdown(analysis[:5000])
571
+
572
+ elif node_name == "deliberation":
573
+ review = update.get("reviewer_analysis", "")
574
+ if review:
575
+ with st.expander(":handshake: Peer Review", expanded=False):
576
+ st.markdown(review[:3000])
577
+
578
+ # Collect images — persist to session state and render in evidence panel
579
+ new_refs = update.get("image_refs", [])
580
+ if new_refs:
581
+ all_image_refs.extend(new_refs)
582
+ st.session_state.image_refs.extend(new_refs)
583
+ # Render each new crop in the evidence panel in real-time
584
+ with evidence_col:
585
+ with evidence_tabs[0]:
586
+ for ref in new_refs:
587
+ try:
588
+ img = Image.open(ref["path"])
589
+ st.image(img, caption=ref["label"], use_container_width=True)
590
+ except Exception:
591
+ st.warning(f"Could not load: {ref['label']}")
592
+
593
+ # Capture final verdict
594
+ if "final_verdict" in update and update["final_verdict"]:
595
+ final_verdict_text = update["final_verdict"]
596
+
597
+ # Show next step label
598
+ if node_name in PROGRESS_LABELS:
599
+ next_labels = {
600
+ "compliance_planner": ["execute_crops", "initial_code_lookup"],
601
+ "execute_crops": ["compliance_analyst"],
602
+ "annotate_crops": ["compliance_analyst"],
603
+ "initial_code_lookup": ["compliance_analyst"],
604
+ "compliance_analyst": ["final_verdict"],
605
+ "targeted_code_lookup": ["compliance_analyst"],
606
+ "deliberation": ["final_verdict"],
607
+ }
608
+ for next_node in next_labels.get(node_name, []):
609
+ if next_node in PROGRESS_LABELS:
610
+ st.write(PROGRESS_LABELS[next_node])
611
+
612
+ if crop_cache.size > 0:
613
+ st.caption(f":file_folder: {crop_cache.stats}")
614
+ status.update(label="Compliance investigation complete", state="complete")
615
+
616
+ # Display final verdict
617
+ if final_verdict_text:
618
+ with chat_col:
619
+ with st.chat_message("assistant"):
620
+ st.markdown(final_verdict_text)
621
+
622
+ st.session_state.chat_history[-1] = ("user", question, [])
623
+ st.session_state.chat_history.append(("assistant", final_verdict_text, all_image_refs))
624
+ else:
625
+ with chat_col:
626
+ st.error("No verdict was generated. Please try again.")
627
+
628
+ st.rerun()
config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Central configuration for the NYC Code Compliance Bot.

API keys are read from the environment (optionally via a ``.env`` file next
to this module), model names are pinned per pipeline stage, and the remaining
constants tune PDF processing and code-database lookups.
"""
import os
from dotenv import load_dotenv

# Explicit path so .env is found regardless of the working directory
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
load_dotenv(os.path.join(_THIS_DIR, ".env"))

# --- API Keys ---
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

# --- Model Names (one model per pipeline stage) ---
PLANNER_MODEL = "gemini-3-flash-preview"
CROPPER_MODEL = "gemini-3-flash-preview"
ANNOTATOR_MODEL = "gemini-2.5-flash-image"
ANALYZER_MODEL = "gemini-3-flash-preview"
CODE_LOOKUP_MODEL = "gpt-5-mini"
DELIBERATION_MODEL = "gpt-5.2"
VERDICT_MODEL = "gemini-3-flash-preview"
METADATA_MODEL = "gemini-3-flash-preview"

# --- ChromaDB ---
# Reuse _THIS_DIR so the DB path is anchored to this file the same way the
# .env path is resolved above (previously recomputed os.path.dirname(__file__)).
CHROMA_DB_PATH = os.path.join(_THIS_DIR, "data", "nyc_code_db")
CHROMA_COLLECTION_NAME = "nyc_building_codes"
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# --- Processing Constants ---
PDF_RENDER_DPI = 100          # rasterisation resolution for PDF pages
MAX_INVESTIGATION_ROUNDS = 3  # default cap on crop/analyze loops
CROP_PADDING_PX = 40          # extra pixels kept around each requested crop

# --- Code Lookup Budgets (per individual query, queries run in parallel) ---
MAX_DISCOVER_CALLS = 1
MAX_FETCH_CALLS = 1
MAX_CODE_LOOKUP_TURNS = 5

# --- Search Defaults ---
DISCOVER_N_RESULTS = 50  # Retrieve more candidates for re-ranking
RERANK_TOP_K = 20  # Return top-K after re-ranking
FETCH_MAX_SECTIONS = 30  # Max sections per chapter fetch
data/BUILDING_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/FUEL_GAS_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/GENERAL_ADMINISTRATIVE_PROVISIONS.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2060b78a8e7c0061ee1dab1afe66a2a3e4cb28694af0495a8b3340a26c69940
3
+ size 20920937
data/MECHANICAL_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/PLUMBING_CODE.json ADDED
The diff for this file is too large to render. See raw diff
 
data/ingest_chromadb.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ingest preprocessed NYC code JSON files into ChromaDB with bge-large-en-v1.5."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ import chromadb
9
+ from chromadb.utils import embedding_functions
10
+
11
+
12
# Sentence-transformers model used to embed code sections; must match the
# EMBEDDING_MODEL_NAME the query side uses against the same collection.
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"
# Collection is dropped and recreated on every full ingest (see create_collection).
COLLECTION_NAME = "nyc_building_codes"
# Persistent DB directory lives next to this script.
DB_PATH = os.path.join(os.path.dirname(__file__), "nyc_code_db")

# Map of JSON files to their code types
CODE_FILES = {
    "BUILDING_CODE.json": "Building",
    "FUEL_GAS_CODE.json": "FuelGas",
    "GENERAL_ADMINISTRATIVE_PROVISIONS.json": "Administrative",
    "MECHANICAL_CODE.json": "Mechanical",
    "PLUMBING_CODE.json": "Plumbing",
}
24
+
25
+
26
def create_collection(db_path: str = DB_PATH, reset: bool = True):
    """Create or reset the ChromaDB collection.

    Returns a (client, collection) pair so callers can keep the client alive.
    """
    client = chromadb.PersistentClient(path=db_path)
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL,
    )

    if reset:
        # Best-effort drop: the collection may not exist on a fresh database.
        try:
            client.delete_collection(name=COLLECTION_NAME)
        except Exception:
            pass
        else:
            print(f"Deleted existing collection '{COLLECTION_NAME}'.")

    collection = client.create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedder,
    )
    return client, collection
45
+
46
+
47
def ingest_json_file(collection, json_path: str, code_type: str) -> int:
    """Ingest a single JSON file into the collection. Returns count of sections added."""
    with open(json_path, "r", encoding="utf-8") as f:
        entries = json.load(f)

    def _flatten(meta: dict) -> dict:
        # ChromaDB metadata values must be str/int/float/bool, so lists are
        # joined into a comma-separated string and everything else stringified.
        flat: dict = {}
        for key, value in meta.items():
            if isinstance(value, list):
                flat[key] = ", ".join(str(item) for item in value) if value else ""
            elif isinstance(value, (bool, int, float)):
                flat[key] = value
            else:
                flat[key] = str(value)
        return flat

    documents: list = []
    metadatas: list = []
    ids: list = []
    seen: set[str] = set()

    for entry in entries:
        meta = entry["metadata"]
        # Ensure code_type is set (should already be from preprocessing)
        meta["code_type"] = code_type

        uid = f"{code_type}_{entry['id']}"
        if uid in seen:
            continue

        documents.append(entry["text"])
        metadatas.append(_flatten(meta))
        ids.append(uid)
        seen.add(uid)

    # Batch upsert
    batch_size = 200  # Smaller batches for larger embeddings
    for start in range(0, len(documents), batch_size):
        stop = min(start + batch_size, len(documents))
        collection.upsert(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )
        print(f" Batch {start // batch_size + 1}: upserted {stop - start} sections")

    return len(ids)
95
+
96
+
97
def ingest_all(data_dir: str, db_path: str = DB_PATH) -> dict[str, int]:
    """Ingest all code JSON files into a fresh ChromaDB collection.

    Args:
        data_dir: Directory containing the preprocessed *_CODE.json files.
        db_path: Directory for the persistent ChromaDB store (recreated).

    Returns:
        Mapping of code type -> number of sections ingested.
    """
    print(f"Creating ChromaDB at {db_path} with embedding model: {EMBEDDING_MODEL}")
    _client, collection = create_collection(db_path, reset=True)

    counts: dict[str, int] = {}
    for filename, code_type in CODE_FILES.items():
        json_path = os.path.join(data_dir, filename)
        if os.path.exists(json_path):
            # FIX: this previously printed the literal "(unknown)" instead of
            # the actual file being ingested.
            print(f"\nIngesting {filename} as '{code_type}'...")
            count = ingest_json_file(collection, json_path, code_type)
            counts[code_type] = count
            print(f" -> {count} sections ingested")
        else:
            print(f"WARNING: {json_path} not found, skipping.")

    total = sum(counts.values())
    print(f"\nIngestion complete. Total: {total} sections across {len(counts)} code types.")
    return counts
116
+
117
+
118
if __name__ == "__main__":
    # Optional CLI argument: directory holding the preprocessed JSON files;
    # defaults to the directory this script lives in.
    source_dir = sys.argv[1] if len(sys.argv) > 1 else os.path.dirname(__file__)
    ingest_all(source_dir)
data/nyc_code_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cba84d670f2a081621d233004d1dc55d6e6766c0d9b3a5dbae28e4337aed86b
3
+ size 118640640
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b13fd3171ed75d30bbc76630532fc4ff49c459ac97735349fc8670bc5402056e
3
+ size 21180000
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4976cac8d856d3914be9284cb6405a0b16019a1c69f0d98c21d517da4492fed0
3
+ size 100
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abf1da4a172d264eca520e35093cd9d1dbfc1a2174c5cc08f2bf14e40dc27c0c
3
+ size 266462
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c59d245953aaf1e3e56e16a29fc026d3d3b255c07909bdf1345af07e98e51a70
3
+ size 20000
data/nyc_code_db/d5ad1fca-6483-43d1-b3bd-08c280e227d1/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2798c188a968c06bd2a7b8bf9886e25e9f0ef8de40118f57f92647e11c7e58cf
3
+ size 42916
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7351c3a89b1d41bdbc04c8cda4131784642164ac6289029f21ea79369ecebfd
3
+ size 43050468
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7de83396c6ea38709c2085abdd0ad029742aff3877b5cf4499cafca3bd2d3977
3
+ size 100
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa1c7ef73ee5c32a15415bf8d5e8d43ec77e26cfbf77cfe04e7ccbb6b295b705
3
+ size 557222
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a246c61293a0e64dc3582af64fbb6af58c2e9c26228c15c5023f6fd32d4dcff1
3
+ size 40652
data/nyc_code_db/f32247b2-1c25-42c9-9177-5ccfe22bb0b1/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a17b8f2739f3b77723cdb5a3c0de9fcbd4e64d65b16a9be028ba214ecab449e2
3
+ size 86212
data/preprocess_codes.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Improved NYC code preprocessing — fixes duplicates, improves metadata, preserves structure."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import json
6
+ import os
7
+ import re
8
+ from collections import Counter, OrderedDict
9
+
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Text cleaning
13
+ # ---------------------------------------------------------------------------
14
+
15
def clean_and_flatten(text: str) -> str:
    """Fix mid-word line breaks and collapse whitespace while preserving list structure.

    Hyphenated words split across lines are rejoined, numbered list items and
    "Exception" clauses are preserved as their own lines, and all remaining
    whitespace is collapsed to single spaces.
    """
    newline_steps = (
        # Rejoin words hyphenated across a line break (e.g. "accord-\nance").
        (r"(\w+)-\s*\n\s*(\w+)", r"\1\2"),
        # Tag numbered list items so they survive the flatten below.
        (r"\n\s*(\d+\.)\s+", r" __LISTBREAK__ \1 "),
        # Tag "Exception"/"Exceptions" clauses the same way.
        (r"\n\s*(Exception(?:s)?[\s:.])", r" __LISTBREAK__ \1"),
    )
    for pat, repl in newline_steps:
        text = re.sub(pat, repl, text)

    flattened = text.replace("\n", " ")
    # Normalize spaced dashes in section numbers (e.g. "28 - 101" -> "28-101").
    flattened = re.sub(r"(\d+)\s*-\s*(\d+)", r"\1-\2", flattened)
    flattened = re.sub(r"\s+", " ", flattened).strip()
    # Restore the tagged list breaks as real newlines.
    return flattened.replace("__LISTBREAK__", "\n")
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Anchor / section detection
33
+ # ---------------------------------------------------------------------------
34
+
35
+ def get_dominant_anchor(content: str) -> str | None:
36
+ """Detect the dominant chapter digit (1-9) or Appendix letter (A-Z)."""
37
+ anchors = re.findall(
38
+ r"(?m)^(?:\*?\s?§?\s?)(?:([1-9])\d{2,3}\.|([A-Z])(?:\d{2,3})?\.)",
39
+ content,
40
+ )
41
+ found = [item for sublist in anchors for item in sublist if item]
42
+ if not found:
43
+ return None
44
+ return Counter(found).most_common(1)[0][0]
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Metadata extraction from section text
49
+ # ---------------------------------------------------------------------------
50
+
51
# Occupancy groups referenced as "Group A-2", "Occupancy B", etc.
_OCCUPANCY_RE = re.compile(
    r"\b(?:Group|Occupancy|Classification)\s+"
    r"([A-Z]-?\d?(?:\s*,\s*[A-Z]-?\d?)*)",
    re.IGNORECASE,
)
# Construction types "Type I".."Type VB".
_CONSTRUCTION_TYPE_RE = re.compile(
    r"\bType\s+(I[A-B]?|II[A-B]?|III[A-B]?|IV[A-B]?|V[A-B]?)\b",
    re.IGNORECASE,
)
# "Exception:" / "Exceptions." clause markers.
_EXCEPTION_RE = re.compile(r"\bException(?:s)?\s*[:.]", re.IGNORECASE)
# Cross-references such as "Section 903.2" or "Sections 903.2 and 907.1".
_CROSS_REF_RE = re.compile(
    r"(?:Section|Sections|§)\s+(\d{2,4}(?:\.\d+)*(?:\s*(?:,|and|through)\s*\d{2,4}(?:\.\d+)*)*)",
    re.IGNORECASE,
)


def extract_rich_metadata(section_id: str, text: str, code_type: str) -> dict:
    """Extract enhanced metadata from section text for better filtering.

    Pulls occupancy classes, construction types, exception markers and
    cross-referenced sections out of *text* and combines them with the
    section-number hierarchy derived from *section_id*.
    """
    parts = section_id.split(".")
    major = parts[0]
    minor = ".".join(parts[:2]) if len(parts) > 1 else major

    # Occupancy classes, de-duplicated in first-seen order.
    occupancy_classes: list[str] = []
    for group in _OCCUPANCY_RE.findall(text):
        for token in re.split(r"\s*,\s*", group):
            token = token.strip().upper()
            if token and token not in occupancy_classes:
                occupancy_classes.append(token)

    # Construction types, upper-cased and sorted for stable output.
    construction_types = sorted({m.upper() for m in _CONSTRUCTION_TYPE_RE.findall(text)})

    # One findall serves both the boolean flag and the count.
    exception_hits = _EXCEPTION_RE.findall(text)

    # Cross-references, excluding self-references, first-seen order.
    cross_references: list[str] = []
    for group in _CROSS_REF_RE.findall(text):
        for ref in re.split(r"\s*(?:,|and|through)\s*", group):
            ref = ref.strip()
            if ref and ref != section_id and ref not in cross_references:
                cross_references.append(ref)

    return {
        "section_full": section_id,
        "parent_major": major,
        "parent_minor": minor,
        "code_type": code_type,
        "occupancy_classes": occupancy_classes,
        "construction_types": construction_types,
        "has_exceptions": bool(exception_hits),
        "exception_count": len(exception_hits),
        "cross_references": cross_references,
    }
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Core extraction with deduplication
114
+ # ---------------------------------------------------------------------------
115
+
116
def extract_trade_sections(
    file_path: str,
    global_dict: OrderedDict,
    code_type: str,
    seen_hashes: dict[str, set[str]],
) -> OrderedDict:
    """Extract code sections from a single source file with deduplication.

    Mutates and returns *global_dict* (section_id -> record) and *seen_hashes*
    (section_id -> set of md5 digests of bodies already merged). Both are
    shared across files of the same code type so repeated sections in later
    files are either skipped (exact duplicate) or appended as continuations.
    """
    if not os.path.exists(file_path):
        return global_dict

    with open(file_path, "r", encoding="utf-8") as f:
        # Non-breaking spaces appear in the extracted text; normalize them.
        content = f.read().replace("\xa0", " ")

    # The dominant leading digit/letter tells us which chapter family this
    # file belongs to; without one there is nothing to extract.
    anchor = get_dominant_anchor(content)
    if not anchor:
        return global_dict

    # Build section-matching regex
    if anchor.isalpha():
        # Appendix-style IDs, e.g. "A101.2" (optional leading letter).
        id_pattern = rf"[A-Z]?{re.escape(anchor)}\d*(?:\.\d+)+"
    else:
        # Numeric IDs, e.g. "907.2.1" for anchor "9".
        id_pattern = rf"{re.escape(anchor)}\d{{2,3}}(?:\.\d+)+"

    # A section heading is the ID at line start followed by its first word.
    pattern = rf"(?m)^\s*[\*§]?\s*({id_pattern})\s+([A-Z\w]+)"
    matches = list(re.finditer(pattern, content))

    # Headings whose "first word" is one of these are actually inline
    # references (e.g. "Sections 903.2 and 907.1"), not real section starts.
    skip_words = {
        "and", "through", "to", "or", "sections", "the", "of", "in", "under", "as",
    }

    for i in range(len(matches)):
        clean_id = matches[i].group(1).strip()
        first_word = matches[i].group(2)

        if first_word.lower() in skip_words:
            continue

        # A section's body runs until the next heading (or end of file).
        start_pos = matches[i].start()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)

        raw_body = content[start_pos:end_pos]
        clean_body = clean_and_flatten(raw_body)

        # Bodies shorter than 60 chars are headings/noise — skip.
        if len(clean_body) < 60:
            continue

        # ------ DEDUPLICATION via content hashing ------
        block_hash = hashlib.md5(clean_body.encode()).hexdigest()

        if clean_id in global_dict:
            # Check if this block is a genuine duplicate
            if clean_id not in seen_hashes:
                seen_hashes[clean_id] = set()
            if block_hash in seen_hashes[clean_id]:
                continue  # Skip exact duplicate
            seen_hashes[clean_id].add(block_hash)

            # New content under an existing ID: append as a continuation and
            # record the extra source file.
            global_dict[clean_id]["text"] += f" [CONT.]: {clean_body}"
            source_name = os.path.basename(file_path)
            if source_name not in global_dict[clean_id]["metadata"]["source"]:
                global_dict[clean_id]["metadata"]["source"] += f", {source_name}"
        else:
            # First sighting of this section ID: build the record with rich
            # metadata and a hierarchy prefix to aid embedding retrieval.
            seen_hashes[clean_id] = {block_hash}
            metadata = extract_rich_metadata(clean_id, clean_body, code_type)
            metadata["source"] = os.path.basename(file_path)

            global_dict[clean_id] = {
                "id": clean_id,
                "text": f"CONTEXT: {metadata['parent_major']} > {metadata['parent_minor']} | CONTENT: {clean_id} {clean_body}",
                "metadata": metadata,
            }

    return global_dict
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Main pipeline
193
+ # ---------------------------------------------------------------------------
194
+
195
+ # File ranges per code type (same as original, but parameterized)
196
# Per-code-type extraction config: which numbered source text files
# (NNN.txt) belong to each code, and the JSON file to write.
# NOTE(review): the excluded numbers appear to be non-section pages
# (covers, tables, etc.) — confirm against the source text dump.
CODE_CONFIGS = {
    "Building": {
        "file_range": [i for i in range(58, 112) if i not in {90, 91, 92, 93, 94, 100, 101, 103, 106, 107}],
        "output_file": "BUILDING_CODE.json",
    },
    "FuelGas": {
        "file_range": [i for i in range(43, 58) if i not in {50, 51, 52, 53, 54, 56}],
        "output_file": "FUEL_GAS_CODE.json",
    },
    "Mechanical": {
        "file_range": [i for i in range(24, 43) if i not in {30, 31}],
        "output_file": "MECHANICAL_CODE.json",
    },
    "Plumbing": {
        "file_range": list(range(1, 24)),
        "output_file": "PLUMBING_CODE.json",
    },
    "Administrative": {
        "file_range": list(range(112, 160)),
        "output_file": "GENERAL_ADMINISTRATIVE_PROVISIONS.json",
    },
}
218
+
219
+
220
def preprocess_all(text_dir: str, output_dir: str) -> dict[str, int]:
    """Run preprocessing for all code types. Returns counts per type.

    Reads NNN.txt files from *text_dir* per CODE_CONFIGS and writes one JSON
    file per code type into *output_dir*.
    """
    os.makedirs(output_dir, exist_ok=True)
    section_counts: dict[str, int] = {}

    for code_type, cfg in CODE_CONFIGS.items():
        # Section dict and hash registry are shared across all files of the
        # same code type so duplicates across files are merged/skipped.
        sections: OrderedDict = OrderedDict()
        hashes: dict[str, set[str]] = {}

        for file_num in cfg["file_range"]:
            source_path = os.path.join(text_dir, f"{file_num:03d}.txt")
            if not os.path.exists(source_path):
                continue
            print(f"[{code_type}] Processing {source_path}...")
            extract_trade_sections(source_path, sections, code_type, hashes)

        records = list(sections.values())
        out_path = os.path.join(output_dir, cfg["output_file"])
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

        section_counts[code_type] = len(records)
        print(f"[{code_type}] Wrote {len(records)} sections to {out_path}")

    return section_counts
244
+
245
+
246
if __name__ == "__main__":
    import sys

    # CLI: preprocess_codes.py [text_dir] [output_dir]
    source_dir = sys.argv[1] if len(sys.argv) > 1 else "Text"
    dest_dir = sys.argv[2] if len(sys.argv) > 2 else "data"

    summary = preprocess_all(source_dir, dest_dir)
    print(f"\nPreprocessing complete: {summary}")
graph.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangGraph definition — compliance workflow with parallel fan-out/fan-in and investigation loop."""
2
+ from __future__ import annotations
3
+
4
+ from langgraph.graph import END, StateGraph
5
+
6
+ from config import MAX_INVESTIGATION_ROUNDS
7
+ from nodes.code_lookup import initial_code_lookup, targeted_code_lookup
8
+ from nodes.compliance_analyst import compliance_analyst
9
+ from nodes.compliance_planner import compliance_planner
10
+ from nodes.cropper import ProgressCallback, execute_crops
11
+ from nodes.annotator import annotate_crops
12
+ from nodes.deliberation import deliberation
13
+ from nodes.final_verdict import final_verdict
14
+ from state import ComplianceState
15
+ from tools.crop_cache import CropCache
16
+ from tools.image_store import ImageStore
17
+
18
+
19
def _build_compliance_graph(
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
) -> StateGraph:
    """Build the compliance analysis graph with parallel fan-out/fan-in.

    Args:
        image_store: Shared store the crop/annotate/analysis nodes read and
            write images through; closed over by the node wrappers below.
        crop_cache: Optional cache passed to execute_crops.
        progress_callback: Optional progress hook passed to execute_crops.

    Returns:
        An un-compiled StateGraph (see compile_compliance_graph).

    Architecture:
        compliance_planner
          ├── execute_crops ──► annotate_crops (optional) ──┐
          └── initial_code_lookup ─────────────────────────┤

        compliance_analyst ◄──┐
          │   │   │           │
        (crops)(code)(done)   │
          │    │    │         │
          └────┘    │         │
               │    │         │
               ▼    ▼         │
        deliberation (opt)    │
               │              │
        final_verdict ────────┘

        END
    """

    # ---- Wrap nodes that need ImageStore / CropCache / callback ----
    # LangGraph nodes take only the state, so dependencies are bound here
    # via closures rather than stored on the state.
    def _execute_crops(state: ComplianceState) -> dict:
        return execute_crops(state, image_store, crop_cache, progress_callback)

    def _annotate_crops(state: ComplianceState) -> dict:
        return annotate_crops(state, image_store)

    def _compliance_analyst(state: ComplianceState) -> dict:
        return compliance_analyst(state, image_store)

    def _deliberation(state: ComplianceState) -> dict:
        return deliberation(state, image_store)

    # ---- Build graph ----
    graph = StateGraph(ComplianceState)

    # Add all nodes
    graph.add_node("compliance_planner", compliance_planner)
    graph.add_node("execute_crops", _execute_crops)
    graph.add_node("annotate_crops", _annotate_crops)
    graph.add_node("initial_code_lookup", initial_code_lookup)
    graph.add_node("compliance_analyst", _compliance_analyst)
    graph.add_node("targeted_code_lookup", targeted_code_lookup)
    graph.add_node("deliberation", _deliberation)
    graph.add_node("final_verdict", final_verdict)

    # ---- Edges ----

    # Entry: planner is always first
    graph.set_entry_point("compliance_planner")

    # Parallel fan-out: planner → both execute_crops AND initial_code_lookup
    graph.add_edge("compliance_planner", "execute_crops")
    graph.add_edge("compliance_planner", "initial_code_lookup")

    # After crops: optionally annotate, then go to compliance_analyst
    def _after_crops(state: ComplianceState) -> str:
        # Skip annotation when disabled globally or when no crop task
        # actually requests it.
        if not state.get("enable_annotation", True):
            return "compliance_analyst"
        crop_tasks = state.get("crop_tasks", [])
        if not any(t.get("annotate") and t.get("annotation_prompt") for t in crop_tasks):
            return "compliance_analyst"
        return "annotate_crops"

    graph.add_conditional_edges(
        "execute_crops",
        _after_crops,
        {"annotate_crops": "annotate_crops", "compliance_analyst": "compliance_analyst"},
    )
    graph.add_edge("annotate_crops", "compliance_analyst")

    # Code lookup also feeds into compliance_analyst (fan-in)
    graph.add_edge("initial_code_lookup", "compliance_analyst")

    # After analysis: loop for more evidence, deliberate, or go to verdict
    def _after_analyst(state: ComplianceState) -> str:
        # Loop back for more evidence only while under the round budget;
        # code lookups take priority over additional crops.
        needs_more = state.get("needs_more_investigation", False)
        round_num = state.get("investigation_round", 0)
        max_rounds = state.get("max_rounds", MAX_INVESTIGATION_ROUNDS)
        has_additional_crops = bool(state.get("additional_crop_tasks", []))
        has_additional_code = bool(state.get("additional_code_queries", []))
        enable_consensus = state.get("enable_consensus", False)

        if needs_more and round_num < max_rounds:
            if has_additional_code:
                return "targeted_code_lookup"
            if has_additional_crops:
                return "execute_crops"
        if enable_consensus:
            return "deliberation"
        return "final_verdict"

    graph.add_conditional_edges(
        "compliance_analyst",
        _after_analyst,
        {
            "execute_crops": "execute_crops",
            "targeted_code_lookup": "targeted_code_lookup",
            "deliberation": "deliberation",
            "final_verdict": "final_verdict",
        },
    )

    # Targeted code lookup feeds back to analyst
    graph.add_edge("targeted_code_lookup", "compliance_analyst")

    # Deliberation → verdict
    graph.add_edge("deliberation", "final_verdict")

    # Verdict → END
    graph.add_edge("final_verdict", END)

    return graph
139
+
140
+
141
def compile_compliance_graph(
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
):
    """Return a compiled, ready-to-invoke compliance graph."""
    workflow = _build_compliance_graph(image_store, crop_cache, progress_callback)
    return workflow.compile()
nodes/__init__.py ADDED
File without changes
nodes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (230 Bytes). View file
 
nodes/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (5.18 kB). View file
 
nodes/__pycache__/code_lookup.cpython-313.pyc ADDED
Binary file (10.6 kB). View file
 
nodes/__pycache__/compliance_analyst.cpython-313.pyc ADDED
Binary file (8.35 kB). View file
 
nodes/__pycache__/compliance_planner.cpython-313.pyc ADDED
Binary file (6.46 kB). View file
 
nodes/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (8.87 kB). View file
 
nodes/__pycache__/deliberation.cpython-313.pyc ADDED
Binary file (3.32 kB). View file
 
nodes/__pycache__/final_verdict.cpython-313.pyc ADDED
Binary file (3.74 kB). View file
 
nodes/__pycache__/metadata_generator.cpython-313.pyc ADDED
Binary file (7.52 kB). View file
 
nodes/annotator.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """annotate_crops node — nano-banana (Gemini image generation) for semantic annotation."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from google import genai
8
+ from google.genai import types
9
+ from PIL import Image
10
+
11
+ from config import ANNOTATOR_MODEL, GOOGLE_API_KEY
12
+ from prompts.annotator import ANNOTATION_WRAPPER
13
+ from state import DrawingReaderState, ImageRef
14
+ from tools.image_store import ImageStore
15
+
16
+
17
+ def _extract_generated_image(response) -> Image.Image | None:
18
+ """Extract the generated image from a Gemini image-generation response."""
19
+ for part in response.candidates[0].content.parts:
20
+ if part.inline_data is not None:
21
+ return Image.open(io.BytesIO(part.inline_data.data))
22
+ return None
23
+
24
+
25
def _annotate_single_crop_sync(
    client: genai.Client,
    crop_ref: ImageRef,
    annotation_prompt: str,
    image_store: ImageStore,
) -> ImageRef | None:
    """Annotate one crop using nano-banana (synchronous).

    Sends the crop plus the wrapped annotation prompt to the image model and
    stores the returned image. Returns the new ImageRef, or None when the
    model produced no image part.
    """
    source_bytes = image_store.load_bytes(crop_ref)
    prompt = ANNOTATION_WRAPPER.format(annotation_prompt=annotation_prompt)

    # Request both TEXT and IMAGE so the model can return an annotated copy.
    response = client.models.generate_content(
        model=ANNOTATOR_MODEL,
        contents=[
            types.Part.from_bytes(data=source_bytes, mime_type="image/png"),
            prompt,
        ],
        config=types.GenerateContentConfig(
            response_modalities=["TEXT", "IMAGE"],
        ),
    )

    generated = _extract_generated_image(response)
    if generated is None:
        return None
    return image_store.save_annotated(crop_ref, generated)
53
+
54
+
55
def annotate_crops(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Run nano-banana annotation on crops that need it.

    Matches the current crop_tasks to the most recently produced crops by
    position, annotates the flagged ones in parallel, and returns the new
    annotated ImageRefs plus a human-readable status line.
    """
    crop_tasks = state.get("crop_tasks", [])
    image_refs = state.get("image_refs", [])

    # Build a mapping: find crops that need annotation.
    # The most recent batch of crops corresponds to the current crop_tasks.
    # Take the LAST len(crop_tasks) crops from image_refs to match by position,
    # so that on loop-back rounds we only match against the newest crops.
    crops_needing_annotation: list[tuple[ImageRef, str]] = []

    all_crops = [r for r in image_refs if r["crop_type"] == "crop"]
    # Only the tail — the most recent batch produced by execute_crops
    recent_crops = all_crops[-len(crop_tasks):] if crop_tasks else []

    for i, task in enumerate(crop_tasks):
        # Positional match: task i corresponds to recent_crops[i]; the bounds
        # check guards against a batch that produced fewer crops than tasks.
        if task["annotate"] and task["annotation_prompt"] and i < len(recent_crops):
            crops_needing_annotation.append(
                (recent_crops[i], task["annotation_prompt"])
            )

    if not crops_needing_annotation:
        return {"status_message": ["No annotation needed for these crops."]}

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Use a thread pool instead of asyncio to avoid event-loop conflicts
    # with Streamlit's own event loop.
    # results is index-aligned with crops_needing_annotation; each slot holds
    # an ImageRef on success, None when no image came back, or the Exception.
    results: list[ImageRef | None | Exception] = [None] * len(crops_needing_annotation)

    with ThreadPoolExecutor(max_workers=min(len(crops_needing_annotation), 4)) as pool:
        future_to_idx = {}
        for i, (ref, prompt) in enumerate(crops_needing_annotation):
            future = pool.submit(
                _annotate_single_crop_sync, client, ref, prompt, image_store,
            )
            future_to_idx[future] = i

        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                results[idx] = future.result()
            except Exception as e:
                # Keep the exception in-slot so per-crop failures are reported
                # without aborting the remaining annotations.
                results[idx] = e

    annotated_refs: list[ImageRef] = []
    errors: list[str] = []
    for i, result in enumerate(results):
        if isinstance(result, Exception):
            errors.append(f"Annotation {i} failed: {result}")
        elif result is not None:
            annotated_refs.append(result)
        else:
            errors.append(f"Annotation {i} returned no image")

    status = f"Annotated {len(annotated_refs)} of {len(crops_needing_annotation)} crops."
    if errors:
        status += f" Issues: {'; '.join(errors)}"

    return {
        "image_refs": annotated_refs,
        "status_message": [status],
    }
nodes/code_lookup.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """code_lookup node — lightweight snippet reviewer (no multi-turn tool loop).
2
+
3
+ Flow:
4
+ 1. discover_code_locations() — ChromaDB semantic search (~1-2 sec per query)
5
+ 2. GPT reviews the raw snippets in a SINGLE call — flags relevant ones with
6
+ a relevance tag and brief note
7
+ 3. Raw flagged snippets + GPT notes go to the compliance analyst
8
+
9
+ No fetch_full_chapter in the initial pass. The compliance analyst can request
10
+ targeted chapter fetches via additional_code_queries if it needs more context.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from concurrent.futures import ThreadPoolExecutor, as_completed
17
+ from datetime import datetime
18
+
19
+ from openai import OpenAI
20
+
21
+ from config import CODE_LOOKUP_MODEL, OPENAI_API_KEY
22
+ from prompts.code_lookup import CODE_REVIEWER_SYSTEM_PROMPT
23
+ from state import AgentMessage, CodeQuery, CodeSection, ComplianceState
24
+ from tools.chroma_tools import QueryCache, discover_code_locations
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Single-call snippet reviewer (replaces the multi-turn tool loop)
31
+ # ---------------------------------------------------------------------------
32
+
33
def _review_snippets(
    research_goal: str,
    discover_report: str,
) -> tuple[str, list[CodeSection]]:
    """GPT reviews discover results in ONE call.

    Returns (brief_review, flagged_sections). Falls back to the raw model
    output and no flagged sections when the JSON response cannot be parsed.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    user_content = (
        f"## Research Goal\n{research_goal}\n\n"
        f"## Code Snippets from Database\n{discover_report}"
    )
    response = client.chat.completions.create(
        model=CODE_LOOKUP_MODEL,
        messages=[
            {"role": "system", "content": CODE_REVIEWER_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        response_format={"type": "json_object"},
    )

    raw = response.choices[0].message.content or "{}"
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        logger.warning("GPT snippet review returned invalid JSON, using raw text.")
        return raw, []

    # Snippets are capped at 1500 chars to bound the analyst's context size.
    flagged_sections = [
        CodeSection(
            section_full=item.get("section_id", "?"),
            code_type=item.get("code_type", "?"),
            parent_major=item.get("chapter", "?"),
            text=item.get("snippet", "")[:1500],
            relevance=item.get("relevance_note", ""),
        )
        for item in parsed.get("relevant_sections", [])
    ]

    return parsed.get("summary", "No summary provided."), flagged_sections
79
+
80
+
81
def _run_single_lookup(
    cq: CodeQuery,
    query_cache: QueryCache | None = None,
) -> tuple[str, list[CodeSection], str]:
    """Run discover + review for ONE code query.

    Returns (summary, flagged_sections, discover_report_raw).
    """
    goal = f"{cq['query']} (Context: {cq['context']})"
    logger.info("ChromaDB query: %s", goal)

    # Step 1: ChromaDB discover (fast, ~1-2s)
    report = discover_code_locations(goal, cache=query_cache)

    # Step 2: GPT reviews snippets in a single call
    brief, flagged = _review_snippets(goal, report)

    return brief, flagged, report
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # LangGraph node functions
103
+ # ---------------------------------------------------------------------------
104
+
105
+
106
def initial_code_lookup(state: ComplianceState) -> dict:
    """Run the initial code lookup based on planner's code_queries.

    All queries run in PARALLEL via ThreadPoolExecutor.
    Each query = 1 discover + 1 GPT review call (no multi-turn loop).

    Returns a partial state update containing the combined ``code_report``,
    all flagged ``code_sections``, per-query ``discussion_log`` entries, and
    a single ``status_message``.
    """
    code_queries = state.get("code_queries", [])
    # No planned queries: return an explanatory log entry rather than failing.
    if not code_queries:
        return {
            "code_report": "No code queries were planned.",
            "code_sections": [],
            "discussion_log": [
                AgentMessage(
                    timestamp=datetime.now().strftime("%H:%M:%S"),
                    agent="code_analyst",
                    action="search_code",
                    summary="No code queries to execute.",
                    detail="The planner did not generate any code queries.",
                    evidence_refs=[],
                )
            ],
            "status_message": ["No code queries to execute."],
        }

    # One cache shared by all workers for this node invocation.
    query_cache = QueryCache()
    discussion_messages: list[AgentMessage] = []

    # Add "searching" messages for all queries upfront
    for cq in code_queries:
        discussion_messages.append(
            AgentMessage(
                timestamp=datetime.now().strftime("%H:%M:%S"),
                agent="code_analyst",
                action="search_code",
                summary=f"Searching: {cq['query'][:80]}...",
                detail=f"Focus area: {cq['focus_area']}\nContext: {cq['context']}",
                evidence_refs=[],
            )
        )

    # Execute ALL queries concurrently.
    # results maps the query's ORIGINAL index -> (summary, flagged, raw report)
    # so output can be reassembled in plan order after out-of-order completion.
    results: dict[int, tuple[str, list[CodeSection], str]] = {}

    with ThreadPoolExecutor(max_workers=min(len(code_queries), 4)) as pool:
        futures = {
            pool.submit(_run_single_lookup, cq, query_cache): i
            for i, cq in enumerate(code_queries)
        }
        for future in as_completed(futures):
            i = futures[future]
            try:
                summary, flagged, _raw = future.result()
                results[i] = (summary, flagged, _raw)
                cq = code_queries[i]
                section_ids = ", ".join(
                    s["section_full"] for s in flagged
                )
                discussion_messages.append(
                    AgentMessage(
                        timestamp=datetime.now().strftime("%H:%M:%S"),
                        agent="code_analyst",
                        action="search_code",
                        summary=(
                            f"Flagged {len(flagged)} sections "
                            f"for '{cq['query']}'"
                        ),
                        detail=(
                            f"**Query:** {cq['query']}\n"
                            f"**Focus:** {cq['focus_area']}\n\n"
                            f"**Sections:** {section_ids}\n\n"
                            f"{summary[:800]}"
                        ),
                        evidence_refs=[s["section_full"] for s in flagged],
                    )
                )
            except Exception as e:
                # A failed query is recorded (not raised) so the remaining
                # queries still contribute to the combined report.
                logger.error("Code query %d failed: %s", i, e)
                results[i] = (f"Error: {e}", [], "")
                discussion_messages.append(
                    AgentMessage(
                        timestamp=datetime.now().strftime("%H:%M:%S"),
                        agent="code_analyst",
                        action="search_code",
                        summary=f"Query {i + 1} failed: {e}",
                        detail=str(e),
                        evidence_refs=[],
                    )
                )

    # Reassemble in original order
    report_parts: list[str] = []
    all_sections: list[CodeSection] = []
    for i in range(len(code_queries)):
        summary, flagged, _raw = results.get(i, ("No result", [], ""))
        cq = code_queries[i]
        report_parts.append(
            f"### Query {i + 1}: {cq['focus_area']}\n{summary}"
        )
        all_sections.extend(flagged)

    combined_report = "\n\n---\n\n".join(report_parts)

    # NOTE(review): discussion_log/status_message appear to rely on additive
    # state reducers (new entries returned, not merged here) — confirm in state.py.
    return {
        "code_report": combined_report,
        "code_sections": all_sections,
        "discussion_log": discussion_messages,
        "status_message": [
            f"Code lookup complete. {len(all_sections)} relevant sections "
            f"flagged across {len(code_queries)} queries."
        ],
    }
217
+
218
+
219
def targeted_code_lookup(state: ComplianceState) -> dict:
    """Run additional code lookups requested by the compliance analyst.

    These may use fetch_full_chapter for deeper context when the analyst
    needs full exception text or cross-reference detail.

    Unlike ``initial_code_lookup``, follow-up queries run SEQUENTIALLY and
    the resulting text is appended to the existing ``code_report``.
    """
    additional_queries = state.get("additional_code_queries", [])
    if not additional_queries:
        return {
            "status_message": ["No additional code queries."],
        }

    query_cache = QueryCache()
    all_sections: list[CodeSection] = []
    report_parts: list[str] = []
    discussion_messages: list[AgentMessage] = []

    # NOTE(review): the loop index is currently unused.
    for i, cq in enumerate(additional_queries):
        # Log the search before running it so the UI shows progress.
        discussion_messages.append(
            AgentMessage(
                timestamp=datetime.now().strftime("%H:%M:%S"),
                agent="code_analyst",
                action="search_code",
                summary=f"Follow-up search: {cq['query'][:80]}...",
                detail=f"Requested by compliance analyst.\nFocus: {cq['focus_area']}",
                evidence_refs=[],
            )
        )

        summary, flagged, _raw = _run_single_lookup(cq, query_cache)
        report_parts.append(summary)
        all_sections.extend(flagged)

        section_ids = ", ".join(s["section_full"] for s in flagged)
        discussion_messages.append(
            AgentMessage(
                timestamp=datetime.now().strftime("%H:%M:%S"),
                agent="code_analyst",
                action="search_code",
                summary=(
                    f"Follow-up: flagged {len(flagged)} sections "
                    f"for '{cq['query']}'"
                ),
                detail=(
                    f"**Query:** {cq['query']}\n"
                    f"**Focus:** {cq['focus_area']}\n\n"
                    f"**Sections:** {section_ids}\n\n"
                    f"{summary[:800]}"
                ),
                evidence_refs=[s["section_full"] for s in flagged],
            )
        )

    # Append to existing report
    existing_report = state.get("code_report", "")
    new_report = "\n\n---\n\n".join(report_parts)
    combined_report = f"{existing_report}\n\n## FOLLOW-UP CODE RESEARCH\n\n{new_report}"

    # NOTE(review): "code_sections" returns only the NEW sections — this
    # presumably relies on an additive reducer in state.py; verify, otherwise
    # earlier sections would be overwritten.
    return {
        "code_report": combined_report,
        "code_sections": all_sections,
        "additional_code_queries": [],  # Clear after processing
        "discussion_log": discussion_messages,
        "status_message": [
            f"Targeted code lookup complete. {len(all_sections)} additional sections "
            f"from {len(additional_queries)} follow-up queries."
        ],
    }
nodes/compliance_analyst.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """compliance_analyst node — multimodal fusion of images + code for compliance determination."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from datetime import datetime
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+ from config import ANALYZER_MODEL, GOOGLE_API_KEY
12
+ from prompts.compliance_analyst import COMPLIANCE_ANALYST_SYSTEM_PROMPT
13
+ from state import AgentMessage, CodeQuery, ComplianceState, CropTask
14
+ from tools.image_store import ImageStore
15
+
16
+
17
def compliance_analyst(state: ComplianceState, image_store: ImageStore) -> dict:
    """Review all cropped images AND code sections to produce compliance findings.

    Builds one multimodal Gemini request (question + code report + discussion
    log + images in legend/detail/annotated order), then parses an optional
    trailing ```json block through which the model can request another
    investigation round (more crops and/or code lookups).
    """
    question = state["question"]
    image_refs = state.get("image_refs", [])
    code_report = state.get("code_report", "")
    legend_pages = set(state.get("legend_pages", []))
    investigation_round = state.get("investigation_round", 0)
    discussion_log = state.get("discussion_log", [])

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Build multimodal content
    content_parts: list[types.Part] = []

    # 1. User question
    content_parts.append(types.Part.from_text(text=f"USER COMPLIANCE QUESTION: {question}"))

    # 2. Code report (legal requirements)
    if code_report:
        content_parts.append(
            types.Part.from_text(
                text=f"\n=== LEGAL REQUIREMENTS FROM NYC CODE ===\n{code_report}"
            )
        )
    else:
        content_parts.append(
            types.Part.from_text(text="\n=== NO CODE SECTIONS RETRIEVED ===\n")
        )

    # 3. Discussion log summary (what previous agents found)
    if discussion_log:
        log_summary = "\n".join(
            f"[{m['timestamp']}] {m['agent']}: {m['summary']}"
            for m in discussion_log[-10:]  # Last 10 messages
        )
        content_parts.append(
            types.Part.from_text(
                text=f"\n=== AGENT DISCUSSION LOG ===\n{log_summary}"
            )
        )

    # 4. Images — legends first, then detail crops, then annotated
    legend_refs = [r for r in image_refs if r["page_num"] in legend_pages and r["crop_type"] == "crop"]
    detail_crops = [r for r in image_refs if r["page_num"] not in legend_pages and r["crop_type"] == "crop"]
    annotated_refs = [r for r in image_refs if r["crop_type"] == "annotated"]

    ordered_refs = legend_refs + detail_crops + annotated_refs

    if legend_refs:
        content_parts.append(
            types.Part.from_text(text="\n=== LEGEND / SCHEDULE CROPS (study these first) ===")
        )

    # Section headers are inserted just before the FIRST image of each group.
    first_detail_id = detail_crops[0]["id"] if detail_crops else None
    first_annotated_id = annotated_refs[0]["id"] if annotated_refs else None

    for ref in ordered_refs:
        if first_detail_id is not None and ref["id"] == first_detail_id:
            content_parts.append(types.Part.from_text(text="\n=== DETAIL CROPS ==="))
        if first_annotated_id is not None and ref["id"] == first_annotated_id:
            content_parts.append(
                types.Part.from_text(text="\n=== ANNOTATED CROPS (highlighted versions) ===")
            )

        content_parts.append(types.Part.from_text(text=f"\nImage: {ref['label']}"))
        # A single unreadable image degrades to a text placeholder instead of
        # aborting the whole analysis.
        try:
            content_parts.append(image_store.to_gemini_part(ref))
        except Exception as e:
            content_parts.append(
                types.Part.from_text(text=f"(Could not load image: {e})")
            )

    # 5. Investigation round context
    content_parts.append(
        types.Part.from_text(
            text=(
                f"\nThis is investigation round {investigation_round + 1}. "
                "Analyze the drawings against the code requirements. "
                "If you need more evidence (crops or code lookups), include a JSON block at the end."
            )
        )
    )

    # Call Gemini
    response = client.models.generate_content(
        model=ANALYZER_MODEL,
        contents=[types.Content(role="user", parts=content_parts)],
        config=types.GenerateContentConfig(
            system_instruction=COMPLIANCE_ANALYST_SYSTEM_PROMPT,
        ),
    )

    analysis_text = response.text

    # Parse additional investigation requests
    needs_more = False
    additional_crops: list[CropTask] = []
    additional_code_queries: list[CodeQuery] = []

    # Only a fenced JSON block that literally contains `"needs_more": true`
    # is treated as a follow-up request; any other JSON is left untouched.
    json_match = re.search(
        r"```json\s*(\{.*?\"needs_more\"\s*:\s*true.*?\})\s*```",
        analysis_text,
        re.DOTALL,
    )
    if json_match:
        try:
            extra = json.loads(json_match.group(1))
            if extra.get("needs_more"):
                needs_more = True

            for t in extra.get("additional_crops", []):
                # Model output is 1-indexed; internal page numbers are 0-indexed.
                raw_page = int(t.get("page_num", 1))
                additional_crops.append(
                    CropTask(
                        page_num=raw_page - 1,
                        crop_instruction=t.get("crop_instruction", ""),
                        annotate=bool(t.get("annotate", False)),
                        annotation_prompt=t.get("annotation_prompt", ""),
                        label=t.get("label", "Additional crop"),
                        priority=int(t.get("priority", 1)),
                    )
                )

            for q in extra.get("additional_code_queries", []):
                additional_code_queries.append(
                    CodeQuery(
                        query=q.get("query", ""),
                        focus_area=q.get("focus_area", ""),
                        context=q.get("context", ""),
                        priority=int(q.get("priority", 0)),
                    )
                )
        except (json.JSONDecodeError, KeyError):
            # Malformed request block: proceed with the analysis text only.
            pass

        # Clean the JSON block from the analysis text
        analysis_text = analysis_text[: json_match.start()].strip()

    # Build discussion message
    if needs_more:
        summary = (
            f"Round {investigation_round + 1} analysis complete. "
            f"Requesting {len(additional_crops)} more crops and "
            f"{len(additional_code_queries)} more code lookups."
        )
    else:
        summary = f"Round {investigation_round + 1} compliance analysis complete."

    discussion_msg = AgentMessage(
        timestamp=datetime.now().strftime("%H:%M:%S"),
        agent="compliance_analyst",
        action="analyze" if not needs_more else "request_more",
        summary=summary,
        detail=analysis_text[:1500],
        evidence_refs=[ref["id"] for ref in image_refs[:5]],
    )

    result: dict = {
        "compliance_analysis": analysis_text,
        "investigation_round": investigation_round + 1,
        "needs_more_investigation": needs_more,
        "discussion_log": [discussion_msg],
        "status_message": [summary],
    }

    # NOTE(review): crop requests are published under BOTH keys — presumably
    # "crop_tasks" feeds the cropper node and "additional_crop_tasks" is for
    # bookkeeping; confirm against graph.py wiring.
    if additional_crops:
        result["crop_tasks"] = additional_crops
        result["additional_crop_tasks"] = additional_crops
    if additional_code_queries:
        result["additional_code_queries"] = additional_code_queries

    return result
nodes/compliance_planner.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """compliance_planner node — dual-plan generation (crops + code queries)."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from datetime import datetime
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+ from config import GOOGLE_API_KEY, PLANNER_MODEL
12
+ from prompts.compliance_planner import COMPLIANCE_PLANNER_SYSTEM_PROMPT
13
+ from state import AgentMessage, CodeQuery, ComplianceState, CropTask
14
+
15
+
16
def compliance_planner(state: ComplianceState) -> dict:
    """Analyze page metadata + user question and produce dual plans for
    image cropping AND code lookup.

    The planner LLM answers with a JSON object (target_pages, legend_pages,
    crop_tasks, code_queries); page numbers are converted from the model's
    1-indexed convention to 0-indexed internal numbering and validated
    against the PDF's page count.
    """
    question = state["question"]
    num_pages = state.get("num_pages", 0)
    page_metadata_json = state.get("page_metadata_json", "")
    investigation_round = state.get("investigation_round", 0)

    client = genai.Client(api_key=GOOGLE_API_KEY)

    question_text = (
        f"USER COMPLIANCE QUESTION: {question}\n\n"
        f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
        f"This is investigation round {investigation_round + 1}.\n\n"
    )

    if page_metadata_json:
        question_text += f"PAGE METADATA:\n{page_metadata_json}"
    else:
        question_text += (
            "No page metadata available. Based on the question alone, "
            "plan what code lookups are needed. Crop tasks will use default pages."
        )

    response = client.models.generate_content(
        model=PLANNER_MODEL,
        contents=[types.Content(role="user", parts=[types.Part.from_text(text=question_text)])],
        config=types.GenerateContentConfig(
            system_instruction=COMPLIANCE_PLANNER_SYSTEM_PROMPT,
        ),
    )

    response_text = response.text.strip()

    # Parse JSON response (greedy brace match tolerates surrounding prose)
    json_match = re.search(r"\{.*\}", response_text, re.DOTALL)

    target_pages: list[int] = []
    legend_pages: list[int] = []
    crop_tasks: list[CropTask] = []
    code_queries: list[CodeQuery] = []

    if json_match:
        try:
            parsed = json.loads(json_match.group())
            # Valid 0-indexed page numbers for this PDF.
            valid_0indexed = set(range(num_pages))

            # Convert 1-indexed model pages to 0-indexed, dropping out-of-range ones.
            target_pages = [
                int(p) - 1 for p in parsed.get("target_pages", [])
                if int(p) - 1 in valid_0indexed
            ]
            legend_pages = [
                int(p) - 1 for p in parsed.get("legend_pages", [])
                if int(p) - 1 in valid_0indexed
            ]

            # NOTE(review): crop task page numbers are NOT range-checked the
            # way target_pages are — confirm downstream handles bad pages.
            for t in parsed.get("crop_tasks", []):
                raw_page = int(t.get("page_num", 1))
                crop_tasks.append(
                    CropTask(
                        page_num=raw_page - 1,
                        crop_instruction=t.get("crop_instruction", ""),
                        annotate=bool(t.get("annotate", False)),
                        annotation_prompt=t.get("annotation_prompt", ""),
                        label=t.get("label", f"Page {raw_page} crop"),
                        priority=int(t.get("priority", 1)),
                    )
                )

            for q in parsed.get("code_queries", []):
                code_queries.append(
                    CodeQuery(
                        query=q.get("query", ""),
                        focus_area=q.get("focus_area", ""),
                        context=q.get("context", ""),
                        priority=int(q.get("priority", 0)),
                    )
                )

        except (json.JSONDecodeError, ValueError, KeyError):
            # Unparseable plan: fall through to the defaults below.
            pass

    # Sort crop tasks by priority
    crop_tasks.sort(key=lambda t: t["priority"])

    # Fallback: if nothing identified, use first 5 pages
    if not target_pages and not crop_tasks:
        target_pages = list(range(min(num_pages, 5)))

    # Build discussion log message
    crop_summary = f"{len(crop_tasks)} crop tasks on pages {', '.join(str(p + 1) for p in target_pages[:5])}"
    code_summary = f"{len(code_queries)} code queries"
    if code_queries:
        code_summary += f" ({', '.join(q['focus_area'] for q in code_queries[:3])})"

    discussion_msg = AgentMessage(
        timestamp=datetime.now().strftime("%H:%M:%S"),
        agent="planner",
        action="plan",
        summary=f"Planned {crop_summary} and {code_summary}.",
        detail=response_text,
        evidence_refs=[],
    )

    return {
        "target_pages": target_pages,
        "legend_pages": legend_pages,
        "crop_tasks": crop_tasks,
        "code_queries": code_queries,
        "discussion_log": [discussion_msg],
        "status_message": [
            f"Selected {len(target_pages)} pages ({len(legend_pages)} legends), "
            f"planned {len(crop_tasks)} crop tasks, {len(code_queries)} code queries."
        ],
    }
nodes/cropper.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """execute_crops node — Gemini code_execution for agentic cropping (PoC 1 style)."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import logging
6
+ import time
7
+ import uuid
8
+ from collections.abc import Callable
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+
11
+ from google import genai
12
+ from google.genai import types
13
+ from PIL import Image
14
+
15
+ from config import CROPPER_MODEL, GOOGLE_API_KEY
16
+ from prompts.cropper import CROPPER_PROMPT_TEMPLATE
17
+ from state import CropTask, DrawingReaderState, ImageRef
18
+ from tools.crop_cache import CropCache
19
+ from tools.image_store import ImageStore
20
+ from tools.pdf_processor import get_page_image_bytes
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Type alias for the progress callback.
25
+ # Signature: (completed_ref, crop_task, source, completed_count, total_count)
26
+ ProgressCallback = Callable[[ImageRef, CropTask, str, int, int], None]
27
+
28
+ # Retry settings for transient API errors (429 / 503)
29
+ MAX_RETRIES = 3
30
+ RETRY_BASE_DELAY = 2.0 # seconds
31
+
32
+
33
+ def _extract_last_image(response) -> Image.Image | None:
34
+ """Extract the last generated image from a Gemini code_execution response."""
35
+ last_image = None
36
+ for part in response.candidates[0].content.parts:
37
+ # Try as_image() first
38
+ try:
39
+ img_data = part.as_image()
40
+ if img_data is not None:
41
+ last_image = Image.open(io.BytesIO(img_data.image_bytes))
42
+ continue
43
+ except Exception:
44
+ pass
45
+ # Fallback: inline_data
46
+ try:
47
+ if hasattr(part, "inline_data") and part.inline_data is not None:
48
+ img_bytes = part.inline_data.data
49
+ last_image = Image.open(io.BytesIO(img_bytes))
50
+ except Exception:
51
+ pass
52
+ return last_image
53
+
54
+
55
+ def _execute_single_crop_sync(
56
+ client: genai.Client,
57
+ page_image_bytes: bytes,
58
+ crop_task: CropTask,
59
+ image_store: ImageStore,
60
+ ) -> tuple[ImageRef, bool]:
61
+ """Execute one crop via Gemini code_execution (synchronous).
62
+
63
+ Includes retry logic for transient 503/429 errors.
64
+
65
+ Returns
66
+ -------
67
+ (image_ref, is_fallback)
68
+ ``is_fallback`` is True when Gemini failed to produce a crop and the
69
+ full page image was returned instead. Fallbacks should NOT be cached.
70
+ """
71
+ prompt = CROPPER_PROMPT_TEMPLATE.format(
72
+ crop_instruction=crop_task["crop_instruction"],
73
+ )
74
+
75
+ image_part = types.Part.from_bytes(data=page_image_bytes, mime_type="image/png")
76
+
77
+ # Retry loop for transient API errors
78
+ response = None
79
+ for attempt in range(MAX_RETRIES):
80
+ try:
81
+ response = client.models.generate_content(
82
+ model=CROPPER_MODEL,
83
+ contents=[image_part, prompt],
84
+ config=types.GenerateContentConfig(
85
+ tools=[types.Tool(code_execution=types.ToolCodeExecution)]
86
+ ),
87
+ )
88
+ break
89
+ except Exception as e:
90
+ err_str = str(e)
91
+ if ("503" in err_str or "429" in err_str or "UNAVAILABLE" in err_str):
92
+ delay = RETRY_BASE_DELAY * (2 ** attempt)
93
+ logger.warning(
94
+ "Crop API error (attempt %d/%d): %s — retrying in %.1fs",
95
+ attempt + 1, MAX_RETRIES, err_str[:120], delay,
96
+ )
97
+ time.sleep(delay)
98
+ else:
99
+ raise
100
+
101
+ is_fallback = True
102
+ if response is not None:
103
+ final_image = _extract_last_image(response)
104
+ if final_image is not None:
105
+ is_fallback = False
106
+ else:
107
+ final_image = Image.open(io.BytesIO(page_image_bytes))
108
+ else:
109
+ # All retries exhausted
110
+ final_image = Image.open(io.BytesIO(page_image_bytes))
111
+
112
+ crop_id = f"crop_{uuid.uuid4().hex[:6]}"
113
+ ref = image_store.save_crop(
114
+ page_num=crop_task["page_num"],
115
+ crop_id=crop_id,
116
+ image=final_image,
117
+ label=crop_task["label"],
118
+ )
119
+ return ref, is_fallback
120
+
121
+
122
+ def execute_crops(
123
+ state: DrawingReaderState,
124
+ image_store: ImageStore,
125
+ crop_cache: CropCache | None = None,
126
+ progress_callback: ProgressCallback | None = None,
127
+ ) -> dict:
128
+ """Execute all crop tasks concurrently, reusing cached crops when possible.
129
+
130
+ Parameters
131
+ ----------
132
+ progress_callback
133
+ Optional callback invoked on the **main thread** each time a crop
134
+ completes (or is served from cache). Called with
135
+ ``(image_ref, crop_task, source, completed_count, total_count)``
136
+ where *source* is ``"cached"``, ``"completed"``, or ``"fallback"``.
137
+ """
138
+ crop_tasks = state.get("crop_tasks", [])
139
+ page_image_dir = state["page_image_dir"]
140
+
141
+ if not crop_tasks:
142
+ return {"status_message": ["No crop tasks to execute."]}
143
+
144
+ total_count = len(crop_tasks)
145
+ completed_count = 0
146
+
147
+ # ----- Phase 1: Separate cache hits from tasks that need API calls -----
148
+ image_refs: list[ImageRef] = [] # final ordered results
149
+ tasks_to_execute: list[tuple[int, CropTask]] = [] # (original_index, task)
150
+ cache_hits = 0
151
+
152
+ for i, ct in enumerate(crop_tasks):
153
+ if crop_cache is not None:
154
+ cached_ref = crop_cache.lookup(ct["page_num"], ct["crop_instruction"])
155
+ if cached_ref is not None:
156
+ image_refs.append(cached_ref)
157
+ cache_hits += 1
158
+ completed_count += 1
159
+ logger.info(
160
+ "Reusing cached crop for '%s' (page %d)",
161
+ ct["label"], ct["page_num"],
162
+ )
163
+ # Notify the UI immediately for each cache hit
164
+ if progress_callback is not None:
165
+ progress_callback(
166
+ cached_ref, ct, "cached", completed_count, total_count,
167
+ )
168
+ continue
169
+ # Not cached — needs an API call
170
+ tasks_to_execute.append((i, ct))
171
+
172
+ # ----- Phase 2: Execute uncached crops via Gemini -----
173
+ errors: list[str] = []
174
+
175
+ if tasks_to_execute:
176
+ client = genai.Client(api_key=GOOGLE_API_KEY)
177
+
178
+ with ThreadPoolExecutor(max_workers=min(len(tasks_to_execute), 4)) as pool:
179
+ future_to_idx: dict = {}
180
+ for exec_idx, (_, ct) in enumerate(tasks_to_execute):
181
+ page_bytes = get_page_image_bytes(page_image_dir, ct["page_num"])
182
+ future = pool.submit(
183
+ _execute_single_crop_sync, client, page_bytes, ct, image_store,
184
+ )
185
+ future_to_idx[future] = exec_idx
186
+
187
+ # Process results as they arrive — this runs on the MAIN thread,
188
+ # so we can safely invoke the Streamlit progress callback here.
189
+ for future in as_completed(future_to_idx):
190
+ exec_idx = future_to_idx[future]
191
+ orig_idx, ct = tasks_to_execute[exec_idx]
192
+ try:
193
+ ref, is_fallback = future.result()
194
+ image_refs.append(ref)
195
+ completed_count += 1
196
+
197
+ # Register in cache (only successful targeted crops)
198
+ if crop_cache is not None:
199
+ crop_cache.register(
200
+ page_num=ct["page_num"],
201
+ crop_instruction=ct["crop_instruction"],
202
+ label=ct["label"],
203
+ image_ref=ref,
204
+ is_fallback=is_fallback,
205
+ )
206
+
207
+ # Notify the UI as each crop completes
208
+ if progress_callback is not None:
209
+ source = "fallback" if is_fallback else "completed"
210
+ progress_callback(
211
+ ref, ct, source, completed_count, total_count,
212
+ )
213
+
214
+ except Exception as e:
215
+ completed_count += 1
216
+ errors.append(f"Crop task {orig_idx} failed: {e}")
217
+ logger.error("Crop task %d failed: %s", orig_idx, e)
218
+
219
+ # ----- Phase 3: Build status message -----
220
+ api_count = len(tasks_to_execute) - len(errors)
221
+ parts = [f"Completed {len(image_refs)} of {total_count} crops"]
222
+ if cache_hits:
223
+ parts.append(f"({cache_hits} from cache, {api_count} new)")
224
+ if errors:
225
+ parts.append(f"Errors: {'; '.join(errors)}")
226
+ status = ". ".join(parts) + "."
227
+
228
+ if crop_cache is not None:
229
+ logger.info(crop_cache.stats)
230
+
231
+ return {
232
+ "image_refs": image_refs,
233
+ "status_message": [status],
234
+ }
nodes/deliberation.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """deliberation node — GPT peer review of Gemini's compliance analysis."""
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime
5
+
6
+ from openai import OpenAI
7
+
8
+ from config import DELIBERATION_MODEL, OPENAI_API_KEY
9
+ from prompts.deliberation import DELIBERATION_SYSTEM_PROMPT
10
+ from state import AgentMessage, ComplianceState
11
+ from tools.image_store import ImageStore
12
+
13
+
14
+ def deliberation(state: ComplianceState, image_store: ImageStore) -> dict:
15
+ """Send compliance analysis + images + code report to GPT for peer review."""
16
+ question = state["question"]
17
+ compliance_analysis = state.get("compliance_analysis", "")
18
+ code_report = state.get("code_report", "")
19
+ image_refs = state.get("image_refs", [])
20
+
21
+ if not compliance_analysis:
22
+ return {
23
+ "reviewer_analysis": "",
24
+ "discussion_log": [
25
+ AgentMessage(
26
+ timestamp=datetime.now().strftime("%H:%M:%S"),
27
+ agent="reviewer",
28
+ action="review",
29
+ summary="No analysis to review.",
30
+ detail="",
31
+ evidence_refs=[],
32
+ )
33
+ ],
34
+ "status_message": ["No analysis to review."],
35
+ }
36
+
37
+ client = OpenAI(api_key=OPENAI_API_KEY)
38
+
39
+ # Build multimodal message
40
+ user_content: list[dict] = [
41
+ {"type": "text", "text": f"USER COMPLIANCE QUESTION: {question}"},
42
+ {"type": "text", "text": f"\n=== LEGAL REQUIREMENTS ===\n{code_report}"},
43
+ {"type": "text", "text": f"\n=== ANALYST'S COMPLIANCE FINDINGS ===\n{compliance_analysis}"},
44
+ {"type": "text", "text": "\nBELOW ARE THE SAME CROPPED IMAGES THE ANALYST EXAMINED:"},
45
+ ]
46
+
47
+ for ref in image_refs:
48
+ user_content.append(
49
+ {"type": "text", "text": f"\nImage: {ref['label']}"}
50
+ )
51
+ try:
52
+ user_content.append(image_store.to_openai_base64(ref))
53
+ except Exception as e:
54
+ user_content.append(
55
+ {"type": "text", "text": f"(Could not load image: {e})"}
56
+ )
57
+
58
+ user_content.append(
59
+ {"type": "text", "text": "\nPerform your peer review of the compliance determination."}
60
+ )
61
+
62
+ response = client.chat.completions.create(
63
+ model=DELIBERATION_MODEL,
64
+ messages=[
65
+ {"role": "system", "content": DELIBERATION_SYSTEM_PROMPT},
66
+ {"role": "user", "content": user_content},
67
+ ],
68
+ )
69
+
70
+ review_text = response.choices[0].message.content or ""
71
+
72
+ discussion_msg = AgentMessage(
73
+ timestamp=datetime.now().strftime("%H:%M:%S"),
74
+ agent="reviewer",
75
+ action="review",
76
+ summary=f"Peer review complete. {review_text[:100]}...",
77
+ detail=review_text[:1500],
78
+ evidence_refs=[],
79
+ )
80
+
81
+ return {
82
+ "reviewer_analysis": review_text,
83
+ "discussion_log": [discussion_msg],
84
+ "status_message": ["Deliberation/peer review complete."],
85
+ }
nodes/final_verdict.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """final_verdict node — synthesize compliance analysis + optional peer review into structured verdict."""
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime
5
+
6
+ from google import genai
7
+ from google.genai import types
8
+
9
+ from config import GOOGLE_API_KEY, VERDICT_MODEL
10
+ from state import AgentMessage, ComplianceState
11
+
12
+
13
+ def final_verdict(state: ComplianceState) -> dict:
14
+ """Produce the final compliance verdict, synthesizing all evidence."""
15
+ question = state["question"]
16
+ compliance_analysis = state.get("compliance_analysis", "")
17
+ reviewer_analysis = state.get("reviewer_analysis", "")
18
+ code_report = state.get("code_report", "")
19
+ enable_consensus = state.get("enable_consensus", False)
20
+
21
+ # If no consensus was run, pass through the analyst's determination
22
+ if not enable_consensus or not reviewer_analysis:
23
+ verdict_msg = AgentMessage(
24
+ timestamp=datetime.now().strftime("%H:%M:%S"),
25
+ agent="compliance_analyst",
26
+ action="verdict",
27
+ summary="Final compliance verdict issued.",
28
+ detail=compliance_analysis[:1000],
29
+ evidence_refs=[],
30
+ )
31
+ return {
32
+ "final_verdict": compliance_analysis,
33
+ "discussion_log": [verdict_msg],
34
+ "status_message": ["Final verdict ready."],
35
+ }
36
+
37
+ # Synthesize both perspectives
38
+ client = genai.Client(api_key=GOOGLE_API_KEY)
39
+
40
+ synthesis_prompt = f"""\
41
+ You are producing a FINAL COMPLIANCE VERDICT for a NYC building code review.
42
+
43
+ USER QUESTION: {question}
44
+
45
+ CODE REQUIREMENTS (from legal research):
46
+ {code_report[:3000]}
47
+
48
+ ANALYST A (Gemini) compliance findings:
49
+ {compliance_analysis}
50
+
51
+ ANALYST B (GPT) peer review:
52
+ {reviewer_analysis}
53
+
54
+ YOUR TASK:
55
+ 1. If both analysts AGREE on compliance status: produce a confident, unified verdict.
56
+ 2. If they PARTIALLY AGREE: produce the verdict based on agreed points, and explicitly \
57
+ note areas of disagreement with evidence from both sides.
58
+ 3. If they DISAGREE: present both interpretations, explain the discrepancy, and state \
59
+ which determination appears better supported by the evidence.
60
+
61
+ OUTPUT FORMAT:
62
+
63
+ ### Compliance Verdict
64
+ **Status:** Compliant | Non-Compliant | Partially Compliant | Unverifiable
65
+
66
+ ### Legal Basis
67
+ For each code requirement checked:
68
+ - **[Code Type] SS[Section] — [Title]**
69
+ - Requirement: [specific measurable requirement]
70
+ - Drawing Evidence: [what was observed]
71
+ - Determination: [compliant/non-compliant/unverifiable]
72
+
73
+ ### Key Findings
74
+ - Bullet points of the most important compliance determinations
75
+
76
+ ### Analyst Consensus
77
+ - Agreement/disagreement between Gemini and GPT analysts
78
+ - Resolution of any conflicts
79
+
80
+ ### Limitations
81
+ - What could not be verified and why
82
+ - Recommended follow-up actions
83
+
84
+ Always cite BOTH code sections AND image crop labels for every factual claim.
85
+ """
86
+
87
+ response = client.models.generate_content(
88
+ model=VERDICT_MODEL,
89
+ contents=[synthesis_prompt],
90
+ )
91
+
92
+ verdict_text = response.text
93
+
94
+ verdict_msg = AgentMessage(
95
+ timestamp=datetime.now().strftime("%H:%M:%S"),
96
+ agent="compliance_analyst",
97
+ action="verdict",
98
+ summary="Final synthesized compliance verdict issued.",
99
+ detail=verdict_text[:1000],
100
+ evidence_refs=[],
101
+ )
102
+
103
+ return {
104
+ "final_verdict": verdict_text,
105
+ "discussion_log": [verdict_msg],
106
+ "status_message": ["Final synthesized verdict ready."],
107
+ }
nodes/metadata_generator.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Background page metadata generator — extracts per-page descriptions from the full PDF.
2
+
3
+ Uses parallel batch processing: the PDF is split into 5-page chunks and each
4
+ chunk is sent to Gemini concurrently for faster metadata extraction.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ import math
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+
13
+ from google import genai
14
+ from google.genai import types
15
+
16
+ from config import GOOGLE_API_KEY, METADATA_MODEL
17
+ from prompts.metadata import METADATA_SYSTEM_PROMPT
18
+ from tools.pdf_processor import extract_page_range_bytes
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Number of PDF pages per batch sent to Gemini in parallel.
23
+ BATCH_SIZE = 5
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # JSON extraction helper
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def _extract_json_array(response_text: str) -> list[dict]:
31
+ """Extract the outermost balanced JSON array from a response string."""
32
+ start = response_text.find("[")
33
+ if start == -1:
34
+ raise ValueError("No JSON array found in metadata generation response")
35
+
36
+ depth = 0
37
+ end = None
38
+ for i in range(start, len(response_text)):
39
+ if response_text[i] == "[":
40
+ depth += 1
41
+ elif response_text[i] == "]":
42
+ depth -= 1
43
+ if depth == 0:
44
+ end = i
45
+ break
46
+
47
+ if end is None:
48
+ raise ValueError("No matching closing bracket found in metadata response")
49
+
50
+ result = json.loads(response_text[start : end + 1])
51
+ if not isinstance(result, list):
52
+ raise ValueError(f"Expected list, got {type(result)}")
53
+ return result
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Single-batch API call
58
+ # ---------------------------------------------------------------------------
59
+
60
def _generate_batch(
    pdf_path: str,
    page_start_0: int,
    page_end_0: int,
    page_start_1: int,
    page_end_1: int,
) -> list[dict]:
    """Run one Gemini call covering a contiguous slice of the drawing set.

    Args:
        pdf_path: Location of the complete PDF on disk.
        page_start_0: First page of the slice (0-indexed, inclusive), used
            when extracting PDF bytes.
        page_end_0: Last page of the slice (0-indexed, inclusive).
        page_start_1: Same first page expressed 1-indexed, embedded in the
            prompt so the model labels pages consistently.
        page_end_1: Same last page expressed 1-indexed.

    Returns:
        The parsed metadata entries (one dict per page) for this slice.
    """
    page_count = page_end_1 - page_start_1 + 1

    # Attach the sliced PDF as an inline binary part.
    excerpt = extract_page_range_bytes(pdf_path, page_start_0, page_end_0)
    excerpt_part = types.Part.from_bytes(data=excerpt, mime_type="application/pdf")

    prompt = (
        f"This PDF excerpt contains {page_count} page(s), "
        f"corresponding to pages {page_start_1} through {page_end_1} of the full drawing set.\n"
        f"Generate structured metadata for ALL {page_count} page(s). "
        f"Use page numbers {page_start_1} through {page_end_1} (1-indexed). "
        f"Return a JSON array with exactly {page_count} objects."
    )
    prompt_part = types.Part.from_text(text=prompt)

    gemini = genai.Client(api_key=GOOGLE_API_KEY)
    reply = gemini.models.generate_content(
        model=METADATA_MODEL,
        contents=[types.Content(role="user", parts=[excerpt_part, prompt_part])],
        config=types.GenerateContentConfig(
            system_instruction=METADATA_SYSTEM_PROMPT,
        ),
    )

    return _extract_json_array(reply.text.strip())
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Public entry point
107
+ # ---------------------------------------------------------------------------
108
+
109
def generate_page_metadata(
    pdf_path: str,
    num_pages: int,
    progress_callback=None,
) -> list[dict]:
    """Extract per-page structured metadata from a PDF using parallel batches.

    The PDF is split into chunks of ``BATCH_SIZE`` pages. Each chunk is sent to
    Gemini concurrently via a thread pool. Results are merged, any missing
    pages are back-filled, and the list is returned sorted by page number.

    Args:
        pdf_path: Path to the full PDF.
        num_pages: Total number of pages. A value of zero (or less) yields
            an empty result without touching the PDF or the API.
        progress_callback: Optional ``(completed_batches, total_batches, page_range_str) -> None``
            called after each batch finishes (including failed batches).

    Returns:
        A list of dicts (1-indexed ``page_num``), one per page, sorted by
        page number.

    Raises:
        RuntimeError: If every batch fails (caller handles the error).
    """
    # Guard: ThreadPoolExecutor rejects max_workers=0, so an empty document
    # must short-circuit before the pool is created.
    if num_pages <= 0:
        return []

    num_batches = math.ceil(num_pages / BATCH_SIZE)
    logger.info(
        "Starting parallel metadata generation: %d pages in %d batches of %d",
        num_pages, num_batches, BATCH_SIZE,
    )

    all_results: list[dict] = []
    errors: list[str] = []
    completed_count = 0

    # Cap concurrency so a very large PDF does not spawn one thread per batch.
    with ThreadPoolExecutor(max_workers=min(num_batches, 8)) as executor:
        futures = {}
        for batch_idx in range(num_batches):
            page_start_0 = batch_idx * BATCH_SIZE
            page_end_0 = min(page_start_0 + BATCH_SIZE - 1, num_pages - 1)
            # 1-indexed equivalents are embedded in the model prompt.
            page_start_1 = page_start_0 + 1
            page_end_1 = page_end_0 + 1

            future = executor.submit(
                _generate_batch,
                pdf_path,
                page_start_0,
                page_end_0,
                page_start_1,
                page_end_1,
            )
            futures[future] = (page_start_1, page_end_1)

        for future in as_completed(futures):
            batch_range = futures[future]
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                completed_count += 1
                logger.info("Batch pages %d-%d complete: %d entries", batch_range[0], batch_range[1], len(batch_results))
                if progress_callback is not None:
                    progress_callback(
                        completed_count,
                        num_batches,
                        f"Pages {batch_range[0]}-{batch_range[1]}",
                    )
            except Exception as e:
                # A failed batch is recorded but does not abort the others;
                # its pages are back-filled with placeholder entries below.
                completed_count += 1
                errors.append(f"Batch pages {batch_range[0]}-{batch_range[1]} failed: {e}")
                logger.exception("Batch pages %d-%d failed", batch_range[0], batch_range[1])
                if progress_callback is not None:
                    progress_callback(
                        completed_count,
                        num_batches,
                        f"Pages {batch_range[0]}-{batch_range[1]} (failed)",
                    )

    if errors and not all_results:
        raise RuntimeError(
            "All metadata batches failed:\n" + "\n".join(errors)
        )

    if errors:
        logger.warning("Some batches failed (results will have gaps): %s", errors)

    # Metadata stays 1-indexed (as the model produced it) because it will be
    # passed as context text to the planner model, which also uses 1-indexed.
    # The planner's *output* is converted to 0-indexed in nodes/planner.py.

    # Fill in any missing pages with minimal entries (1-indexed).
    covered_pages = {item.get("page_num") for item in all_results}
    for p in range(1, num_pages + 1):
        if p not in covered_pages:
            all_results.append({
                "page_num": p,
                "sheet_id": "unknown",
                "sheet_title": "Unknown",
                "discipline": "other",
                "page_type": "other",
                "description": "Metadata not extracted for this page.",
                "key_elements": [],
                "spatial_coverage": "",
            })

    # Sort by page number.
    all_results.sort(key=lambda x: x.get("page_num", 0))

    return all_results
prompts/__init__.py ADDED
File without changes
prompts/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (232 Bytes). View file
 
prompts/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (1.1 kB). View file
 
prompts/__pycache__/code_lookup.cpython-313.pyc ADDED
Binary file (1.95 kB). View file
 
prompts/__pycache__/compliance_analyst.cpython-313.pyc ADDED
Binary file (4.29 kB). View file
 
prompts/__pycache__/compliance_planner.cpython-313.pyc ADDED
Binary file (3.2 kB). View file
 
prompts/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (1.18 kB). View file
 
prompts/__pycache__/deliberation.cpython-313.pyc ADDED
Binary file (1.71 kB). View file