Ryan2219 commited on
Commit
d8d538f
·
verified ·
1 Parent(s): 4a413b3

Upload 47 files

Browse files
Files changed (48) hide show
  1. .gitattributes +1 -0
  2. default_drawings/NorthMaconPark.pdf +3 -0
  3. nodes/__init__.py +0 -0
  4. nodes/__pycache__/__init__.cpython-313.pyc +0 -0
  5. nodes/__pycache__/analyzer.cpython-313.pyc +0 -0
  6. nodes/__pycache__/annotator.cpython-313.pyc +0 -0
  7. nodes/__pycache__/consensus.cpython-313.pyc +0 -0
  8. nodes/__pycache__/cropper.cpython-313.pyc +0 -0
  9. nodes/__pycache__/ingest.cpython-313.pyc +0 -0
  10. nodes/__pycache__/legends.cpython-313.pyc +0 -0
  11. nodes/__pycache__/metadata_generator.cpython-313.pyc +0 -0
  12. nodes/__pycache__/planner.cpython-313.pyc +0 -0
  13. nodes/__pycache__/retrieve.cpython-313.pyc +0 -0
  14. nodes/__pycache__/synthesizer.cpython-313.pyc +0 -0
  15. nodes/analyzer.py +132 -0
  16. nodes/annotator.py +117 -0
  17. nodes/consensus.py +58 -0
  18. nodes/cropper.py +204 -0
  19. nodes/ingest.py +22 -0
  20. nodes/metadata_generator.py +186 -0
  21. nodes/planner.py +127 -0
  22. nodes/synthesizer.py +58 -0
  23. prompts/__init__.py +0 -0
  24. prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  25. prompts/__pycache__/analyzer.cpython-313.pyc +0 -0
  26. prompts/__pycache__/annotator.cpython-313.pyc +0 -0
  27. prompts/__pycache__/consensus.cpython-313.pyc +0 -0
  28. prompts/__pycache__/cropper.cpython-313.pyc +0 -0
  29. prompts/__pycache__/metadata.cpython-313.pyc +0 -0
  30. prompts/__pycache__/planner.cpython-313.pyc +0 -0
  31. prompts/analyzer.py +56 -0
  32. prompts/annotator.py +22 -0
  33. prompts/consensus.py +29 -0
  34. prompts/cropper.py +21 -0
  35. prompts/metadata.py +40 -0
  36. prompts/planner.py +186 -0
  37. tools/__init__.py +0 -0
  38. tools/__pycache__/__init__.cpython-313.pyc +0 -0
  39. tools/__pycache__/crop_cache.cpython-313.pyc +0 -0
  40. tools/__pycache__/file_search.cpython-313.pyc +0 -0
  41. tools/__pycache__/image_store.cpython-313.pyc +0 -0
  42. tools/__pycache__/metadata_cache.cpython-313.pyc +0 -0
  43. tools/__pycache__/pdf_processor.cpython-313.pyc +0 -0
  44. tools/__pycache__/vector_store.cpython-313.pyc +0 -0
  45. tools/crop_cache.py +176 -0
  46. tools/image_store.py +138 -0
  47. tools/metadata_cache.py +131 -0
  48. tools/pdf_processor.py +95 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Gemini_Generated_Image_3ow7sj3ow7sj3ow7.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Gemini_Generated_Image_3ow7sj3ow7sj3ow7.png filter=lfs diff=lfs merge=lfs -text
37
+ default_drawings/NorthMaconPark.pdf filter=lfs diff=lfs merge=lfs -text
default_drawings/NorthMaconPark.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aed76b73fbe205e1579e3a00be6e95b7564e72594b1fdb83311819d447f8fc4
3
+ size 39114794
nodes/__init__.py ADDED
File without changes
nodes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes). View file
 
nodes/__pycache__/analyzer.cpython-313.pyc ADDED
Binary file (5.87 kB). View file
 
nodes/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (5.05 kB). View file
 
nodes/__pycache__/consensus.cpython-313.pyc ADDED
Binary file (2.37 kB). View file
 
nodes/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (7.69 kB). View file
 
nodes/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (917 Bytes). View file
 
nodes/__pycache__/legends.cpython-313.pyc ADDED
Binary file (3.57 kB). View file
 
nodes/__pycache__/metadata_generator.cpython-313.pyc ADDED
Binary file (7.51 kB). View file
 
nodes/__pycache__/planner.cpython-313.pyc ADDED
Binary file (5.53 kB). View file
 
nodes/__pycache__/retrieve.cpython-313.pyc ADDED
Binary file (6.3 kB). View file
 
nodes/__pycache__/synthesizer.cpython-313.pyc ADDED
Binary file (2.23 kB). View file
 
nodes/analyzer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """analyze_findings node — parent agent examines crops and answers the question."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+
7
+ from google import genai
8
+ from google.genai import types
9
+
10
+ from config import ANALYZER_MODEL, GOOGLE_API_KEY
11
+ from prompts.analyzer import ANALYZER_SYSTEM_PROMPT
12
+ from state import CropTask, DrawingReaderState
13
+ from tools.image_store import ImageStore
14
+
15
+
16
def analyze_findings(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Review all cropped/annotated images and produce an answer.

    Builds a multimodal request (legend crops first, then detail crops, then
    annotated versions), sends it to the analyzer model, and parses an
    optional trailing ```json block in which the model may request more crops.

    Returns a partial state update with ``gemini_analysis``,
    ``investigation_round``, ``needs_more_investigation``, ``status_message``,
    and — only when the model asked for more — ``crop_tasks``.
    """
    question = state["question"]
    image_refs = state.get("image_refs", [])
    legend_pages = set(state.get("legend_pages", []))
    investigation_round = state.get("investigation_round", 0)

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Build multimodal content — legends first, then crops, then annotated.
    content_parts: list[types.Part] = []
    content_parts.append(types.Part.from_text(text=f"USER QUESTION: {question}"))

    # Sort images: legend crops first, then detail crops, then annotated versions.
    legend_refs = [r for r in image_refs if r["page_num"] in legend_pages and r["crop_type"] == "crop"]
    detail_crops = [r for r in image_refs if r["page_num"] not in legend_pages and r["crop_type"] == "crop"]
    annotated_refs = [r for r in image_refs if r["crop_type"] == "annotated"]

    ordered_refs = legend_refs + detail_crops + annotated_refs

    # Section headers are inserted immediately before the first image of each group.
    first_detail_id = detail_crops[0]["id"] if detail_crops else None
    first_annotated_id = annotated_refs[0]["id"] if annotated_refs else None

    if legend_refs:
        content_parts.append(
            types.Part.from_text(
                text="\n=== LEGEND / SCHEDULE CROPS (study these first) ===",
            )
        )

    for ref in ordered_refs:
        if first_detail_id is not None and ref["id"] == first_detail_id:
            content_parts.append(
                types.Part.from_text(text="\n=== DETAIL CROPS ===")
            )
        if first_annotated_id is not None and ref["id"] == first_annotated_id:
            content_parts.append(
                types.Part.from_text(
                    text="\n=== ANNOTATED CROPS (numbered/highlighted versions) ===",
                )
            )

        content_parts.append(
            types.Part.from_text(text=f"\nImage: {ref['label']}")
        )
        try:
            content_parts.append(image_store.to_gemini_part(ref))
        except Exception as e:
            # Keep going with a placeholder so one bad image doesn't abort the run.
            content_parts.append(
                types.Part.from_text(text=f"(Could not load image: {e})")
            )

    content_parts.append(
        types.Part.from_text(
            text=f"\nThis is investigation round {investigation_round + 1}. "
            "Analyze the images and answer the user's question. "
            "If you need more crops, include a JSON block at the end of your response."
        )
    )

    response = client.models.generate_content(
        model=ANALYZER_MODEL,
        contents=[types.Content(role="user", parts=content_parts)],
        config=types.GenerateContentConfig(
            system_instruction=ANALYZER_SYSTEM_PROMPT,
        ),
    )

    # BUGFIX: response.text can be None (e.g. blocked response / no text
    # parts), and re.search(pattern, None) raises TypeError. Coerce to "".
    analysis_text = response.text or ""

    # Check if the model requested additional investigation.
    needs_more = False
    additional_crops: list[CropTask] = []

    json_match = re.search(
        r'```json\s*(\{.*?"needs_more"\s*:\s*true.*?\})\s*```',
        analysis_text,
        re.DOTALL,
    )
    if json_match:
        try:
            extra = json.loads(json_match.group(1))
            if extra.get("needs_more"):
                needs_more = True
                for t in extra.get("additional_crops", []):
                    raw_page = int(t.get("page_num", 1))
                    additional_crops.append(
                        CropTask(
                            page_num=raw_page - 1,  # convert 1-indexed → 0-indexed
                            crop_instruction=t.get("crop_instruction", ""),
                            annotate=bool(t.get("annotate", False)),
                            annotation_prompt=t.get("annotation_prompt", ""),
                            label=t.get("label", "Additional crop"),
                            priority=int(t.get("priority", 1)),
                        )
                    )
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            # BUGFIX: int() on model-supplied fields can raise ValueError /
            # TypeError; this parse is best-effort, so swallow those too
            # instead of crashing the node.
            pass

        # Strip the JSON block from the user-visible analysis text.
        analysis_text = analysis_text[: json_match.start()].strip()

    result: dict = {
        "gemini_analysis": analysis_text,
        "investigation_round": investigation_round + 1,
        "needs_more_investigation": needs_more,
        "status_message": "Analysis complete."
        if not needs_more
        else f"Requesting {len(additional_crops)} additional crops (round {investigation_round + 2}).",
    }

    if additional_crops:
        result["crop_tasks"] = additional_crops

    return result
nodes/annotator.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """annotate_crops node — nano-banana (Gemini image generation) for semantic annotation."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from google import genai
8
+ from google.genai import types
9
+ from PIL import Image
10
+
11
+ from config import ANNOTATOR_MODEL, GOOGLE_API_KEY
12
+ from prompts.annotator import ANNOTATION_WRAPPER
13
+ from state import DrawingReaderState, ImageRef
14
+ from tools.image_store import ImageStore
15
+
16
+
17
+ def _extract_generated_image(response) -> Image.Image | None:
18
+ """Extract the generated image from a Gemini image-generation response."""
19
+ for part in response.candidates[0].content.parts:
20
+ if part.inline_data is not None:
21
+ return Image.open(io.BytesIO(part.inline_data.data))
22
+ return None
23
+
24
+
25
def _annotate_single_crop_sync(
    client: genai.Client,
    crop_ref: ImageRef,
    annotation_prompt: str,
    image_store: ImageStore,
) -> ImageRef | None:
    """Run a single crop through the nano-banana annotation model (blocking).

    Returns the stored annotated ImageRef, or None when the model response
    contained no image.
    """
    source_bytes = image_store.load_bytes(crop_ref)
    prompt = ANNOTATION_WRAPPER.format(annotation_prompt=annotation_prompt)

    response = client.models.generate_content(
        model=ANNOTATOR_MODEL,
        contents=[
            types.Part.from_bytes(data=source_bytes, mime_type="image/png"),
            prompt,
        ],
        config=types.GenerateContentConfig(
            response_modalities=["TEXT", "IMAGE"],
        ),
    )

    image = _extract_generated_image(response)
    if image is None:
        return None
    return image_store.save_annotated(crop_ref, image)
53
+
54
+
55
def annotate_crops(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Run nano-banana annotation on the crops that asked for it.

    Pairs the most recent batch of crops (the tail of ``image_refs``) with
    the current ``crop_tasks`` by position, annotates the flagged ones in a
    thread pool, and returns the new annotated ImageRefs plus a status line.
    """
    crop_tasks = state.get("crop_tasks", [])
    image_refs = state.get("image_refs", [])

    # Positional pairing: execute_crops produces one crop per task, so the
    # newest len(crop_tasks) crop-type refs line up with the current tasks —
    # this keeps loop-back rounds from re-matching older crops.
    crop_refs = [r for r in image_refs if r["crop_type"] == "crop"]
    recent = crop_refs[-len(crop_tasks):] if crop_tasks else []

    jobs: list[tuple[ImageRef, str]] = [
        (recent[i], task["annotation_prompt"])
        for i, task in enumerate(crop_tasks)
        if task["annotate"] and task["annotation_prompt"] and i < len(recent)
    ]

    if not jobs:
        return {"status_message": "No annotation needed for these crops."}

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Threads (not asyncio) so we don't fight Streamlit's own event loop.
    outcomes: list[ImageRef | None | Exception] = [None] * len(jobs)
    with ThreadPoolExecutor(max_workers=min(len(jobs), 4)) as pool:
        pending = {
            pool.submit(_annotate_single_crop_sync, client, ref, prompt, image_store): i
            for i, (ref, prompt) in enumerate(jobs)
        }
        for done in as_completed(pending):
            slot = pending[done]
            try:
                outcomes[slot] = done.result()
            except Exception as exc:
                outcomes[slot] = exc

    annotated: list[ImageRef] = []
    problems: list[str] = []
    for i, outcome in enumerate(outcomes):
        if isinstance(outcome, Exception):
            problems.append(f"Annotation {i} failed: {outcome}")
        elif outcome is None:
            problems.append(f"Annotation {i} returned no image")
        else:
            annotated.append(outcome)

    status = f"Annotated {len(annotated)} of {len(jobs)} crops."
    if problems:
        status += f" Issues: {'; '.join(problems)}"

    return {
        "image_refs": annotated,
        "status_message": status,
    }
nodes/consensus.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """consensus_review node — GPT-4o reviews Gemini's analysis."""
2
+ from __future__ import annotations
3
+
4
+ from openai import OpenAI
5
+
6
+ from config import CONSENSUS_MODEL, OPENAI_API_KEY
7
+ from prompts.consensus import CONSENSUS_SYSTEM_PROMPT
8
+ from state import DrawingReaderState
9
+ from tools.image_store import ImageStore
10
+
11
+
12
def consensus_review(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Send the crops plus Gemini's draft answer to GPT for peer review.

    Returns a partial state update with ``gpt_analysis`` (the reviewer's
    text, or "" when there was nothing to review) and a status message.
    """
    question = state["question"]
    gemini_analysis = state.get("gemini_analysis", "")
    image_refs = state.get("image_refs", [])

    # Nothing to review — skip the API call entirely.
    if not gemini_analysis:
        return {"gpt_analysis": "", "status_message": "No analysis to review."}

    client = OpenAI(api_key=OPENAI_API_KEY)

    # Assemble the multimodal user message: question, draft, then every image
    # preceded by its label.
    user_content: list[dict] = [
        {"type": "text", "text": f"USER QUESTION: {question}"},
        {"type": "text", "text": f"ANALYST'S DRAFT ANSWER:\n{gemini_analysis}"},
        {"type": "text", "text": "\nBELOW ARE THE SAME CROPPED IMAGES THE ANALYST EXAMINED:"},
    ]

    for ref in image_refs:
        user_content.append({"type": "text", "text": f"\nImage: {ref['label']}"})
        try:
            user_content.append(image_store.to_openai_base64(ref))
        except Exception as e:
            # Degrade gracefully — tell the reviewer the image is unavailable.
            user_content.append({"type": "text", "text": f"(Could not load image: {e})"})

    user_content.append({"type": "text", "text": "\nPerform your peer review as specified."})

    response = client.chat.completions.create(
        model=CONSENSUS_MODEL,
        messages=[
            {"role": "system", "content": CONSENSUS_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )

    review_text = response.choices[0].message.content or ""

    return {
        "gpt_analysis": review_text,
        "status_message": "GPT consensus review complete.",
    }
nodes/cropper.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """execute_crops node — Gemini code_execution for agentic cropping (PoC 1 style)."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import logging
6
+ import uuid
7
+ from collections.abc import Callable
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ from google import genai
11
+ from google.genai import types
12
+ from PIL import Image
13
+
14
+ from config import CROPPER_MODEL, GOOGLE_API_KEY
15
+ from prompts.cropper import CROPPER_PROMPT_TEMPLATE
16
+ from state import CropTask, DrawingReaderState, ImageRef
17
+ from tools.crop_cache import CropCache
18
+ from tools.image_store import ImageStore
19
+ from tools.pdf_processor import get_page_image_bytes
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Type alias for the progress callback.
24
+ # Signature: (completed_ref, crop_task, source, completed_count, total_count)
25
+ ProgressCallback = Callable[[ImageRef, CropTask, str, int, int], None]
26
+
27
+
28
+ def _extract_last_image(response) -> Image.Image | None:
29
+ """Extract the last generated image from a Gemini code_execution response."""
30
+ last_image = None
31
+ for part in response.candidates[0].content.parts:
32
+ # Try as_image() first
33
+ try:
34
+ img_data = part.as_image()
35
+ if img_data is not None:
36
+ last_image = Image.open(io.BytesIO(img_data.image_bytes))
37
+ continue
38
+ except Exception:
39
+ pass
40
+ # Fallback: inline_data
41
+ try:
42
+ if hasattr(part, "inline_data") and part.inline_data is not None:
43
+ img_bytes = part.inline_data.data
44
+ last_image = Image.open(io.BytesIO(img_bytes))
45
+ except Exception:
46
+ pass
47
+ return last_image
48
+
49
+
50
def _execute_single_crop_sync(
    client: genai.Client,
    page_image_bytes: bytes,
    crop_task: CropTask,
    image_store: ImageStore,
) -> tuple[ImageRef, bool]:
    """Run one crop through Gemini code_execution (blocking).

    Returns
    -------
    (image_ref, is_fallback)
        ``is_fallback`` is True when Gemini produced no cropped image and the
        untouched full page was stored instead. Callers should not cache
        fallback results.
    """
    prompt = CROPPER_PROMPT_TEMPLATE.format(
        crop_instruction=crop_task["crop_instruction"],
    )
    page_part = types.Part.from_bytes(data=page_image_bytes, mime_type="image/png")

    response = client.models.generate_content(
        model=CROPPER_MODEL,
        contents=[page_part, prompt],
        config=types.GenerateContentConfig(
            tools=[types.Tool(code_execution=types.ToolCodeExecution)]
        ),
    )

    cropped = _extract_last_image(response)
    if cropped is None:
        # No image came back — fall back to the whole page.
        result_image = Image.open(io.BytesIO(page_image_bytes))
        is_fallback = True
    else:
        result_image = cropped
        is_fallback = False

    ref = image_store.save_crop(
        page_num=crop_task["page_num"],
        crop_id=f"crop_{uuid.uuid4().hex[:6]}",
        image=result_image,
        label=crop_task["label"],
    )
    return ref, is_fallback
92
+
93
+
94
def execute_crops(
    state: DrawingReaderState,
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
) -> dict:
    """Execute all crop tasks concurrently, reusing cached crops when possible.

    Results are returned in the SAME order as ``crop_tasks`` (see bugfix note
    below), regardless of cache hits or thread completion order.

    Parameters
    ----------
    progress_callback
        Optional callback invoked on the **main thread** each time a crop
        completes (or is served from cache). Called with
        ``(image_ref, crop_task, source, completed_count, total_count)``
        where *source* is ``"cached"``, ``"completed"``, or ``"fallback"``.
    """
    crop_tasks = state.get("crop_tasks", [])
    page_image_dir = state["page_image_dir"]

    if not crop_tasks:
        return {"status_message": "No crop tasks to execute."}

    total_count = len(crop_tasks)
    completed_count = 0

    # BUGFIX: results used to be appended in completion order (cache hits
    # first, then whichever thread finished first). Downstream, annotate_crops
    # pairs the newest crops with crop_tasks positionally, so out-of-order
    # completion mis-paired annotation prompts with crops. Fill a slot per
    # original task index instead, then compact, so refs line up with tasks.
    slots: list[ImageRef | None] = [None] * total_count
    tasks_to_execute: list[tuple[int, CropTask]] = []  # (original_index, task)
    cache_hits = 0

    # ----- Phase 1: serve cache hits, queue everything else -----
    for i, ct in enumerate(crop_tasks):
        cached_ref = (
            crop_cache.lookup(ct["page_num"], ct["crop_instruction"])
            if crop_cache is not None
            else None
        )
        if cached_ref is not None:
            slots[i] = cached_ref
            cache_hits += 1
            completed_count += 1
            logger.info(
                "Reusing cached crop for '%s' (page %d)",
                ct["label"], ct["page_num"],
            )
            # Notify the UI immediately for each cache hit.
            if progress_callback is not None:
                progress_callback(
                    cached_ref, ct, "cached", completed_count, total_count,
                )
        else:
            tasks_to_execute.append((i, ct))

    # ----- Phase 2: execute uncached crops via Gemini -----
    errors: list[str] = []

    if tasks_to_execute:
        client = genai.Client(api_key=GOOGLE_API_KEY)

        with ThreadPoolExecutor(max_workers=min(len(tasks_to_execute), 4)) as pool:
            future_to_task: dict = {}
            for orig_idx, ct in tasks_to_execute:
                page_bytes = get_page_image_bytes(page_image_dir, ct["page_num"])
                future = pool.submit(
                    _execute_single_crop_sync, client, page_bytes, ct, image_store,
                )
                future_to_task[future] = (orig_idx, ct)

            # Process results as they arrive — this runs on the MAIN thread,
            # so invoking the Streamlit progress callback here is safe.
            for future in as_completed(future_to_task):
                orig_idx, ct = future_to_task[future]
                try:
                    ref, is_fallback = future.result()
                    slots[orig_idx] = ref
                    completed_count += 1

                    # Register in cache (the cache itself skips fallbacks).
                    if crop_cache is not None:
                        crop_cache.register(
                            page_num=ct["page_num"],
                            crop_instruction=ct["crop_instruction"],
                            label=ct["label"],
                            image_ref=ref,
                            is_fallback=is_fallback,
                        )

                    # Notify the UI as each crop completes.
                    if progress_callback is not None:
                        source = "fallback" if is_fallback else "completed"
                        progress_callback(
                            ref, ct, source, completed_count, total_count,
                        )

                except Exception as e:
                    errors.append(f"Crop task {orig_idx} failed: {e}")

    # Compact: drop failed slots while preserving task order.
    image_refs: list[ImageRef] = [r for r in slots if r is not None]

    # ----- Phase 3: build status message -----
    api_count = len(tasks_to_execute) - len(errors)
    parts = [f"Completed {len(image_refs)} of {total_count} crops"]
    if cache_hits:
        parts.append(f"({cache_hits} from cache, {api_count} new)")
    if errors:
        parts.append(f"Errors: {'; '.join(errors)}")
    status = ". ".join(parts) + "."

    if crop_cache is not None:
        logger.info(crop_cache.stats)

    return {
        "image_refs": image_refs,
        "status_message": status,
    }
nodes/ingest.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ingest_pdf node — renders all PDF pages as images at configured DPI."""
2
+ from __future__ import annotations
3
+
4
+ from state import DrawingReaderState
5
+ from tools import pdf_processor
6
+
7
+
8
def ingest_pdf(state: DrawingReaderState) -> dict:
    """Render every page of the input PDF to a PNG for visual analysis.

    Rendering happens at PDF_RENDER_DPI (100 DPI), a balance between speed
    and legibility for construction drawings.
    """
    page_count = pdf_processor.render_pages(
        state["pdf_path"], state["page_image_dir"]
    )
    return {
        "num_pages": page_count,
        "status_message": f"Converted {page_count} pages to images.",
    }
nodes/metadata_generator.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Background page metadata generator — extracts per-page descriptions from the full PDF.
2
+
3
+ Uses parallel batch processing: the PDF is split into 5-page chunks and each
4
+ chunk is sent to Gemini concurrently for faster metadata extraction.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ import math
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+
13
+ from google import genai
14
+ from google.genai import types
15
+
16
+ from config import GOOGLE_API_KEY, METADATA_MODEL
17
+ from prompts.metadata import METADATA_SYSTEM_PROMPT
18
+ from tools.pdf_processor import extract_page_range_bytes
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Number of PDF pages per batch sent to Gemini in parallel.
23
+ BATCH_SIZE = 5
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # JSON extraction helper
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def _extract_json_array(response_text: str) -> list[dict]:
31
+ """Extract the outermost balanced JSON array from a response string."""
32
+ start = response_text.find("[")
33
+ if start == -1:
34
+ raise ValueError("No JSON array found in metadata generation response")
35
+
36
+ depth = 0
37
+ end = None
38
+ for i in range(start, len(response_text)):
39
+ if response_text[i] == "[":
40
+ depth += 1
41
+ elif response_text[i] == "]":
42
+ depth -= 1
43
+ if depth == 0:
44
+ end = i
45
+ break
46
+
47
+ if end is None:
48
+ raise ValueError("No matching closing bracket found in metadata response")
49
+
50
+ result = json.loads(response_text[start : end + 1])
51
+ if not isinstance(result, list):
52
+ raise ValueError(f"Expected list, got {type(result)}")
53
+ return result
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Single-batch API call
58
+ # ---------------------------------------------------------------------------
59
+
60
def _generate_batch(
    pdf_path: str,
    page_start_0: int,
    page_end_0: int,
    page_start_1: int,
    page_end_1: int,
) -> list[dict]:
    """Generate metadata for a contiguous range of pages.

    Args:
        pdf_path: Path to the full PDF on disk.
        page_start_0: First page (0-indexed, inclusive) for PDF extraction.
        page_end_0: Last page (0-indexed, inclusive) for PDF extraction.
        page_start_1: First page (1-indexed) — used in the prompt text.
        page_end_1: Last page (1-indexed) — used in the prompt text.

    Returns:
        List of metadata dicts for the pages in this batch.

    Raises:
        ValueError: when the model response contains no parseable JSON array.
    """
    client = genai.Client(api_key=GOOGLE_API_KEY)

    batch_pdf_bytes = extract_page_range_bytes(pdf_path, page_start_0, page_end_0)
    pdf_part = types.Part.from_bytes(data=batch_pdf_bytes, mime_type="application/pdf")

    num_batch_pages = page_end_1 - page_start_1 + 1
    instruction_text = (
        f"This PDF excerpt contains {num_batch_pages} page(s), "
        f"corresponding to pages {page_start_1} through {page_end_1} of the full drawing set.\n"
        f"Generate structured metadata for ALL {num_batch_pages} page(s). "
        f"Use page numbers {page_start_1} through {page_end_1} (1-indexed). "
        f"Return a JSON array with exactly {num_batch_pages} objects."
    )
    instruction_part = types.Part.from_text(text=instruction_text)

    response = client.models.generate_content(
        model=METADATA_MODEL,
        contents=[types.Content(role="user", parts=[pdf_part, instruction_part])],
        config=types.GenerateContentConfig(
            system_instruction=METADATA_SYSTEM_PROMPT,
        ),
    )

    # BUGFIX: response.text can be None (blocked/empty response) — calling
    # .strip() on it raised AttributeError. Coerce to "" so the failure
    # surfaces as the clearer "No JSON array found" ValueError instead.
    return _extract_json_array((response.text or "").strip())
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Public entry point
107
+ # ---------------------------------------------------------------------------
108
+
109
def generate_page_metadata(pdf_path: str, num_pages: int) -> list[dict]:
    """Extract per-page structured metadata from a PDF using parallel batches.

    The PDF is split into chunks of ``BATCH_SIZE`` pages. Each chunk is sent to
    Gemini concurrently via a thread pool. Results are merged, any missing
    pages are back-filled with placeholder entries, and the list is returned
    sorted by page number.

    Args:
        pdf_path: Path to the PDF on disk.
        num_pages: Total number of pages in the PDF.

    Returns:
        A list of dicts (1-indexed ``page_num``), one per page. An empty list
        when ``num_pages`` is zero or negative.

    Raises:
        RuntimeError: when every batch fails.
    """
    # BUGFIX: guard the empty case — ThreadPoolExecutor(max_workers=0)
    # raises ValueError.
    if num_pages <= 0:
        return []

    num_batches = math.ceil(num_pages / BATCH_SIZE)
    logger.info(
        "Starting parallel metadata generation: %d pages in %d batches of %d",
        num_pages, num_batches, BATCH_SIZE,
    )

    all_results: list[dict] = []
    errors: list[str] = []

    # Cap the pool: one thread per batch, but never an unbounded number of
    # concurrent Gemini calls for a very large drawing set.
    with ThreadPoolExecutor(max_workers=min(num_batches, 8)) as executor:
        futures = {}
        for batch_idx in range(num_batches):
            page_start_0 = batch_idx * BATCH_SIZE
            page_end_0 = min(page_start_0 + BATCH_SIZE - 1, num_pages - 1)
            page_start_1 = page_start_0 + 1
            page_end_1 = page_end_0 + 1

            future = executor.submit(
                _generate_batch,
                pdf_path,
                page_start_0,
                page_end_0,
                page_start_1,
                page_end_1,
            )
            futures[future] = (page_start_1, page_end_1)

        for future in as_completed(futures):
            batch_range = futures[future]
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                logger.info(
                    "Batch pages %d-%d complete: %d entries",
                    batch_range[0], batch_range[1], len(batch_results),
                )
            except Exception as e:
                errors.append(f"Batch pages {batch_range[0]}-{batch_range[1]} failed: {e}")
                logger.exception("Batch pages %d-%d failed", batch_range[0], batch_range[1])

    if errors and not all_results:
        # (Dropped a stray f-prefix on this literal — it had no placeholders.)
        raise RuntimeError(
            "All metadata batches failed:\n" + "\n".join(errors)
        )

    if errors:
        logger.warning("Some batches failed (results will have gaps): %s", errors)

    # Metadata stays 1-indexed (as the model produced it) because it will be
    # passed as context text to the planner model, which also uses 1-indexed.
    # The planner's *output* is converted to 0-indexed in nodes/planner.py.

    # Fill in any missing pages with minimal entries (1-indexed).
    covered_pages = {item.get("page_num") for item in all_results}
    for p in range(1, num_pages + 1):
        if p not in covered_pages:
            all_results.append({
                "page_num": p,
                "sheet_id": "unknown",
                "sheet_title": "Unknown",
                "discipline": "other",
                "page_type": "other",
                "description": "Metadata not extracted for this page.",
                "key_elements": [],
                "spatial_coverage": "",
            })

    # Sort by page number for stable downstream consumption.
    all_results.sort(key=lambda x: x.get("page_num", 0))

    return all_results
nodes/planner.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """plan_and_select node — plans crop tasks from PDF or cached page metadata."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+ from config import GOOGLE_API_KEY, PLANNER_MODEL
12
+ from prompts.planner import PLANNER_SYSTEM_PROMPT, PLANNER_SYSTEM_PROMPT_METADATA
13
+ from state import CropTask, DrawingReaderState
14
+
15
+
16
def plan_and_select(state: DrawingReaderState) -> dict:
    """Identify relevant pages and produce crop tasks.

    Two modes:
    - **Metadata mode** (fast): when ``page_metadata_json`` is available, the planner
      works from structured text descriptions — no PDF upload needed.
    - **PDF mode** (fallback): uploads the full PDF as a native PDF part to Gemini.

    Returns a partial state update with ``target_pages``, ``legend_pages`` and
    ``crop_tasks`` (page numbers converted to 0-indexed) plus a ``status_message``.
    """
    question = state["question"]
    pdf_path = state["pdf_path"]
    num_pages = state.get("num_pages", 0)
    investigation_round = state.get("investigation_round", 0)
    page_metadata_json = state.get("page_metadata_json", "")

    client = genai.Client(api_key=GOOGLE_API_KEY)

    if page_metadata_json:
        # ---- Metadata-based planning (fast, no PDF upload) ----
        question_text = (
            f"USER QUESTION: {question}\n\n"
            f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
            f"This is investigation round {investigation_round + 1}.\n\n"
            f"PAGE METADATA:\n{page_metadata_json}"
        )
        question_part = types.Part.from_text(text=question_text)

        response = client.models.generate_content(
            model=PLANNER_MODEL,
            contents=[types.Content(role="user", parts=[question_part])],
            config=types.GenerateContentConfig(
                system_instruction=PLANNER_SYSTEM_PROMPT_METADATA,
            ),
        )
        planning_mode = "metadata"
    else:
        # ---- Full PDF upload (fallback) ----
        pdf_bytes = Path(pdf_path).read_bytes()
        pdf_part = types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf")

        question_text = (
            f"USER QUESTION: {question}\n\n"
            f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
            f"This is investigation round {investigation_round + 1}."
        )
        question_part = types.Part.from_text(text=question_text)

        response = client.models.generate_content(
            model=PLANNER_MODEL,
            contents=[types.Content(role="user", parts=[pdf_part, question_part])],
            config=types.GenerateContentConfig(
                system_instruction=PLANNER_SYSTEM_PROMPT,
            ),
        )
        planning_mode = "pdf"

    # BUGFIX: ``response.text`` can be None when the model returns no text part
    # (e.g. a safety block); treat that the same as an unparseable answer
    # instead of crashing on ``None.strip()``.
    response_text = (response.text or "").strip()

    # Parse the JSON response
    # Expected: {"target_pages": [...], "legend_pages": [...], "crop_tasks": [...]}
    json_match = re.search(r"\{.*\}", response_text, re.DOTALL)

    target_pages: list[int] = []
    legend_pages: list[int] = []
    crop_tasks: list[CropTask] = []

    if json_match:
        try:
            parsed = json.loads(json_match.group())

            # Model returns 1-indexed page numbers; convert to 0-indexed for internal use.
            valid_0indexed = set(range(num_pages))
            target_pages = [
                int(p) - 1 for p in parsed.get("target_pages", [])
                if int(p) - 1 in valid_0indexed
            ]
            legend_pages = [
                int(p) - 1 for p in parsed.get("legend_pages", [])
                if int(p) - 1 in valid_0indexed
            ]

            for t in parsed.get("crop_tasks", []):
                raw_page = int(t.get("page_num", 1))
                # BUGFIX: validate crop-task pages against the document range,
                # consistent with target_pages/legend_pages above. Previously a
                # hallucinated page number flowed through unchecked and produced
                # out-of-range crop tasks downstream.
                if raw_page - 1 not in valid_0indexed:
                    continue
                crop_tasks.append(
                    CropTask(
                        page_num=raw_page - 1,  # convert 1-indexed → 0-indexed
                        crop_instruction=t.get("crop_instruction", ""),
                        annotate=bool(t.get("annotate", False)),
                        annotation_prompt=t.get("annotation_prompt", ""),
                        label=t.get("label", f"Page {raw_page} crop"),
                        priority=int(t.get("priority", 1)),
                    )
                )
        except (json.JSONDecodeError, ValueError, KeyError):
            # Malformed planner output — fall through to the first-pages fallback.
            pass

    # Sort by priority (legends = 0 first)
    crop_tasks.sort(key=lambda t: t["priority"])

    # Fallback: if nothing identified, use first 5 pages
    if not target_pages and not crop_tasks:
        target_pages = list(range(min(num_pages, 5)))

    mode_label = "from page index" if planning_mode == "metadata" else "from full PDF"
    return {
        "target_pages": target_pages,
        "legend_pages": legend_pages,
        "crop_tasks": crop_tasks,
        "status_message": (
            f"Selected {len(target_pages)} pages ({len(legend_pages)} legends), "
            f"planned {len(crop_tasks)} crop tasks ({mode_label})."
        ),
    }
nodes/synthesizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """synthesize_answer node — final answer combining Gemini + optional GPT perspectives."""
2
+ from __future__ import annotations
3
+
4
+ from google import genai
5
+ from google.genai import types
6
+
7
+ from config import GOOGLE_API_KEY, SYNTHESIZER_MODEL
8
+ from state import DrawingReaderState
9
+
10
+
11
def synthesize_answer(state: DrawingReaderState) -> dict:
    """Produce the final answer, synthesizing consensus if present."""
    question = state["question"]
    gemini_analysis = state.get("gemini_analysis", "")
    gpt_analysis = state.get("gpt_analysis", "")

    # Without a peer review (consensus disabled, or GPT produced nothing),
    # Gemini's analysis already is the final answer — pass it through.
    if not (state.get("enable_consensus", False) and gpt_analysis):
        return {
            "final_answer": gemini_analysis,
            "status_message": "Final answer ready.",
        }

    synthesis_prompt = f"""\
You are producing a FINAL ANSWER to a construction drawing question.

USER QUESTION: {question}

ANALYST A (Gemini) says:
{gemini_analysis}

ANALYST B (GPT) peer review:
{gpt_analysis}

YOUR TASK:
1. If both analysts AGREE: produce a confident, unified answer citing the consensus.
2. If they PARTIALLY AGREE: produce the answer based on the agreed points, and \
explicitly note areas of disagreement with evidence from both sides.
3. If they DISAGREE: present both interpretations clearly, explain the discrepancy, \
and state which interpretation appears better supported by the evidence (or that \
the question cannot be definitively answered from the available images).

Always cite page numbers, sheet names, and image labels for every factual claim.
"""

    # Merge the two perspectives into one answer with the synthesizer model.
    client = genai.Client(api_key=GOOGLE_API_KEY)
    response = client.models.generate_content(
        model=SYNTHESIZER_MODEL,
        contents=[synthesis_prompt],
    )

    return {
        "final_answer": response.text,
        "status_message": "Final synthesized answer ready.",
    }
prompts/__init__.py ADDED
File without changes
prompts/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (220 Bytes). View file
 
prompts/__pycache__/analyzer.cpython-313.pyc ADDED
Binary file (3.06 kB). View file
 
prompts/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (984 Bytes). View file
 
prompts/__pycache__/consensus.cpython-313.pyc ADDED
Binary file (1.46 kB). View file
 
prompts/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (1.07 kB). View file
 
prompts/__pycache__/metadata.cpython-313.pyc ADDED
Binary file (2.48 kB). View file
 
prompts/__pycache__/planner.cpython-313.pyc ADDED
Binary file (8.79 kB). View file
 
prompts/analyzer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt for the analyze_findings node."""
2
+
3
+ ANALYZER_SYSTEM_PROMPT = """\
4
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
5
+ and construction documentation.
6
+
7
+ You are the ANALYST in a multi-step drawing analysis workflow. You receive:
8
+ - The user's question
9
+ - Cropped images from relevant drawing pages (some may be annotated)
10
+ - Retrieved text context from the document
11
+ - Image labels describing what each crop shows
12
+
13
+ The FIRST images are always LEGENDS, SCHEDULES, or GENERAL NOTES. Study these \
14
+ carefully to understand all symbols, abbreviations, and conventions BEFORE examining \
15
+ the detail crops that follow.
16
+
17
+ YOUR RESPONSIBILITIES:
18
+ 1. **Study legends first.** Identify all relevant symbols, callouts, and abbreviations \
19
+ from the legend crops before analyzing any detail crops.
20
+ 2. **Examine each crop carefully.** For annotated crops, reference the numbered \
21
+ annotations (e.g., "Item #3 shows a supply air diffuser").
22
+ 3. **Provide spatially-grounded answers.** Always describe WHERE things are: \
23
+ "Located in the upper-left quadrant, north of AHU-1, adjacent to Room 204."
24
+ 4. **Describe symbols visually.** When referencing equipment or symbols, describe \
25
+ what they look like: "the circular symbol with radiating lines represents..."
26
+ 5. **Cite your sources.** Reference images by their labels: "As shown in the crop \
27
+ labeled 'Page 12 (M-101) - Gymnasium Diffusers'..."
28
+ 6. **Be honest about uncertainty.** If you cannot clearly see something, or if \
29
+ information is ambiguous, say so explicitly. Never guess.
30
+ 7. **Trace paths step-by-step.** When describing duct routes, piping, or conduit: \
31
+ describe the path in spatial order from source to destination.
32
+
33
+ ANSWER FORMAT:
34
+ - Restate the question briefly
35
+ - Walk through your reasoning with references to specific crops and annotations
36
+ - Provide a clear, definitive answer (or explain what is uncertain and why)
37
+ - Mention page numbers and sheet names for every factual claim
38
+
39
+ ADDITIONAL INVESTIGATION:
40
+ If you determine that the provided crops are INSUFFICIENT to answer the question, \
41
+ you may request additional crops. To do this, include a JSON block at the END of \
42
+ your response in this exact format. ALL PAGE NUMBERS ARE 1-INDEXED (first page = 1).
43
+
44
+ ```json
45
+ {"needs_more": true, "reason": "brief explanation of what information is missing", "additional_crops": [
46
+ {"page_num": 15, "crop_instruction": "...", "annotate": false, "annotation_prompt": "", "label": "...", "priority": 1}
47
+ ]}
48
+ ```
49
+
50
+ RULES FOR ADDITIONAL CROPS:
51
+ - Only request crops for areas you have NOT already examined. Never re-request the \
52
+ same page region — you already have those images above.
53
+ - Each additional crop must target a DIFFERENT area or page than what was already provided.
54
+ - Explain briefly WHY you need each additional crop (what information is missing).
55
+ - Do not request them speculatively — only when truly necessary to answer the question.
56
+ """
prompts/annotator.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt wrapper for nano-banana annotation.
2
+
3
+ The actual annotation prompt is written per-task by the planner node.
4
+ This module provides a wrapper that ensures consistent instructions
5
+ around the planner's specific annotation request.
6
+ """
7
+
8
+ ANNOTATION_WRAPPER = """\
9
+ You are annotating a cropped section of a construction/engineering drawing.
10
+
11
+ {annotation_prompt}
12
+
13
+ CRITICAL RULES:
14
+ - Keep the original drawing CLEARLY VISIBLE underneath your annotations.
15
+ - Use bright, high-contrast colors that stand out against the drawing.
16
+ - Make labels and numbers large enough to read easily.
17
+ - Number items sequentially (1, 2, 3...) when counting.
18
+ - Use consistent colors: RED for primary items of interest, BLUE for secondary \
19
+ items, GREEN for paths/traces.
20
+ - Do not remove, obscure, or redraw any part of the original drawing.
21
+ - Output the annotated image.
22
+ """
prompts/consensus.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt for the GPT consensus review node."""
2
+
3
+ CONSENSUS_SYSTEM_PROMPT = """\
4
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
5
+ and construction documentation.
6
+
7
+ You are performing a PEER REVIEW of another analyst's interpretation of construction \
8
+ drawings. You will receive:
9
+ - The user's original question
10
+ - The same cropped drawing images that the analyst examined
11
+ - The analyst's draft answer
12
+
13
+ YOUR TASK:
14
+ 1. Independently examine each cropped image.
15
+ 2. Compare your observations against the analyst's claims.
16
+ 3. For each factual claim in the analyst's response, determine if you:
17
+ - AGREE (you see the same evidence in the images)
18
+ - DISAGREE (you see different evidence, or the claim is not supported)
19
+ - CANNOT VERIFY (the available images don't clearly show what's claimed)
20
+ 4. If you disagree, explain specifically what you see differently and cite which \
21
+ image/crop contradicts the finding.
22
+ 5. Note any details the analyst may have MISSED that are visible in the images.
23
+
24
+ OUTPUT FORMAT:
25
+ - Start with your overall assessment: AGREE / PARTIALLY AGREE / DISAGREE
26
+ - List each point of agreement or disagreement with image references
27
+ - Provide your own answer to the user's question if it differs from the analyst's
28
+ - Be specific and cite crop labels for every observation
29
+ """
prompts/cropper.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt template for the execute_crops node (Gemini code_execution)."""
2
+
3
+ CROPPER_PROMPT_TEMPLATE = """\
4
+ You are processing a construction drawing page. Your task:
5
+ {crop_instruction}
6
+
7
+ Instructions:
8
+ 1. Examine the full image to orient yourself and locate the requested area.
9
+ 2. Use Python with PIL/Pillow to crop the image to just the requested region.
10
+ 3. Add padding of approximately 40 pixels on each side (clamped to image bounds).
11
+ 4. Iterate if needed - if your first crop is too wide, too narrow, or misses the \
12
+ target, refine it. Take up to 3 attempts to get a tight, accurate crop.
13
+ 5. Output the final cropped image.
14
+
15
+ IMPORTANT RULES:
16
+ - Do NOT annotate, draw on, or modify the image content in any way.
17
+ - Just produce a clean, accurate crop of the requested area.
18
+ - The final output must be the best possible crop.
19
+ - If you cannot locate the requested area, crop to the most likely region and \
20
+ note this in your text output.
21
+ """
prompts/metadata.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt for the background page metadata generator."""
2
+
3
+ METADATA_SYSTEM_PROMPT = """\
4
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
5
+ and construction documentation.
6
+
7
+ You are analyzing a BATCH of pages from a construction drawing PDF to generate \
8
+ structured metadata for each page. This metadata will be used by a downstream \
9
+ planner to select relevant pages WITHOUT needing to re-examine the full PDF visually.
10
+
11
+ YOUR TASK: For EVERY page in this batch, produce a JSON object with these fields. \
12
+ Use the page numbers specified in the user instruction (1-indexed).
13
+
14
+ - "page_num": integer, 1-indexed page number as specified in the instruction
15
+ - "sheet_id": string, the sheet number/ID from the title block (e.g., "M-101", \
16
+ "A-201", "G-001"). Use "unknown" if not visible.
17
+ - "sheet_title": string, the sheet title from the title block (e.g., "First Floor \
18
+ HVAC Plan"). Use "Untitled" if not visible.
19
+ - "discipline": one of "mechanical", "electrical", "plumbing", "architectural", \
20
+ "structural", "civil", "general", "fire_protection", "demolition", "other"
21
+ - "page_type": one of "floor_plan", "legend", "schedule", "detail", "section", \
22
+ "elevation", "title_sheet", "notes", "diagram", "cover", "other"
23
+ - "description": 2-4 sentences describing what is visible on this page. Be specific \
24
+ about spatial areas covered, equipment shown, systems depicted.
25
+ - "key_elements": list of strings naming notable items visible (equipment tags, room \
26
+ names, system names, detail callouts). Include 5-15 items per page.
27
+ - "spatial_coverage": string describing what physical area or zone this page covers \
28
+ (e.g., "First floor, east wing", "Building section A-A looking north", "Roof plan"). \
29
+ Empty string for legends/schedules.
30
+
31
+ YOU MUST RETURN A SINGLE JSON ARRAY containing one object per page. No other text \
32
+ before or after. The array must be ordered by page_num.
33
+
34
+ IMPORTANT RULES:
35
+ 1. Cover EVERY page in this batch. Do not skip any.
36
+ 2. Be specific in descriptions — mention room numbers, equipment tags, duct sizes, \
37
+ panel names.
38
+ 3. For legend/schedule pages, list the specific items they define in key_elements.
39
+ 4. Use discipline-specific vocabulary (e.g., "VAV box", "branch circuit", "sanitary riser").
40
+ """
prompts/planner.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompts for the plan_and_select node.
2
+
3
+ Two variants:
4
+ - PLANNER_SYSTEM_PROMPT: used when the full PDF is uploaded (first question / no metadata)
5
+ - PLANNER_SYSTEM_PROMPT_METADATA: used when pre-computed page metadata is available
6
+ """
7
+
8
+ PLANNER_SYSTEM_PROMPT = """\
9
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
10
+ and construction documentation. You specialize in interpreting construction drawing \
11
+ sets (architectural, mechanical, electrical, plumbing, structural, civil, demolition).
12
+
13
+ You are the PLANNER in a multi-step drawing analysis workflow. You receive the \
14
+ COMPLETE PDF of a construction drawing set along with the user's question.
15
+
16
+ YOUR JOB: Analyze the entire PDF to understand what is on each page, then produce \
17
+ a plan that identifies relevant pages and specifies crop tasks for downstream agents.
18
+
19
+ YOU MUST RETURN A SINGLE JSON OBJECT with three keys. No other text before or after.
20
+
21
+ {
22
+ "target_pages": [1-indexed page numbers relevant to the question],
23
+ "legend_pages": [1-indexed page numbers that are legends/schedules/notes],
24
+ "crop_tasks": [list of crop task objects]
25
+ }
26
+
27
+ WORKFLOW RULES:
28
+
29
+ 1. **Review the entire PDF first.** Understand the drawing set structure: title \
30
+ sheets, legends, floor plans, details, schedules, sections, elevations.
31
+
32
+ 2. **Select target pages.** Identify the pages most relevant to the user's question \
33
+ (up to 10). Include both the detail pages AND any legends/schedules needed to \
34
+ interpret those pages.
35
+
36
+ 3. **Identify legend pages.** From your target pages, flag which ones are legends, \
37
+ schedules, abbreviation lists, symbol keys, general notes, keynotes, or \
38
+ specification tables. Only include legends DIRECTLY relevant to the question -- \
39
+ e.g., if the question is about electrical, include the electrical legend only, \
40
+ NOT plumbing or structural legends.
41
+
42
+ 4. **Plan crop tasks.** For each relevant area on each target page, create a crop \
43
+ task object with these fields:
44
+ - "page_num": 1-indexed page number (first page = 1)
45
+ - "crop_instruction": precise description of what region to crop, e.g., \
46
+ "Crop to Room 204 and its immediate surroundings showing all ductwork connections"
47
+ - "annotate": true/false -- set true when the question requires counting items, \
48
+ tracing paths, identifying spatial relationships, or distinguishing similar items. \
49
+ Set false for legend/schedule/text crops.
50
+ - "annotation_prompt": when annotate=true, a clear prompt describing what to \
51
+ highlight (colors, numbering, what to annotate). Always include "Keep the \
52
+ original drawing clearly visible underneath." Leave empty string when annotate=false.
53
+ - "label": descriptive label for the crop, e.g., "Page 12 (M-101) - Gymnasium HVAC layout"
54
+ - "priority": 0 for legends/schedules, 1 for detail crops
55
+
56
+ 5. **Legends first.** Always include crop tasks for relevant legends. Assign priority=0.
57
+
58
+ 6. **Be specific.** Each crop instruction must describe a precise region of the page. \
59
+ NEVER write "Crop to the relevant area."
60
+
61
+ 7. **Minimize work.** Choose the FEWEST crops needed for completeness. Each crop is \
62
+ expensive (5-30 seconds). Aim for 3-6 total crop tasks. If two items are on the \
63
+ same area of a page, use ONE crop covering both. One well-targeted crop per page \
64
+ is usually sufficient.
65
+
66
+ 8. **Labels matter.** Each crop needs a descriptive label that the analysis model \
67
+ will use to reference the image.
68
+
69
+ ALL PAGE NUMBERS ARE 1-INDEXED. The first page of the PDF is page 1, not page 0.
70
+
71
+ EXAMPLE OUTPUT:
72
+ {
73
+ "target_pages": [5, 12, 14],
74
+ "legend_pages": [5],
75
+ "crop_tasks": [
76
+ {
77
+ "page_num": 5,
78
+ "crop_instruction": "Crop to the HVAC legend showing all duct symbols and abbreviations.",
79
+ "annotate": false,
80
+ "annotation_prompt": "",
81
+ "label": "Page 5 (M-001 Legend) - HVAC Symbol Legend",
82
+ "priority": 0
83
+ },
84
+ {
85
+ "page_num": 12,
86
+ "crop_instruction": "Crop to the gymnasium area showing all supply air diffusers and ductwork.",
87
+ "annotate": true,
88
+ "annotation_prompt": "Draw bright red numbered bounding boxes (1, 2, 3...) around each supply air diffuser symbol. Draw blue boxes around any AHU or RTU. Keep the original drawing clearly visible underneath.",
89
+ "label": "Page 12 (M-101) - Gymnasium Diffusers",
90
+ "priority": 1
91
+ }
92
+ ]
93
+ }
94
+ """
95
+
96
+
97
+ PLANNER_SYSTEM_PROMPT_METADATA = """\
98
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
99
+ and construction documentation. You specialize in interpreting construction drawing \
100
+ sets (architectural, mechanical, electrical, plumbing, structural, civil, demolition).
101
+
102
+ You are the PLANNER in a multi-step drawing analysis workflow. You DO NOT have the \
103
+ visual PDF. Instead, you receive STRUCTURED METADATA describing each page of the \
104
+ drawing set. Use this metadata to select relevant pages and plan crop tasks.
105
+
106
+ The metadata for each page includes:
107
+ - sheet_id: the sheet number (e.g., "M-101")
108
+ - sheet_title: the sheet name (e.g., "First Floor HVAC Plan")
109
+ - discipline: mechanical/electrical/plumbing/architectural/etc.
110
+ - page_type: floor_plan/legend/schedule/detail/section/elevation/etc.
111
+ - description: 2-4 sentences describing what is visible
112
+ - key_elements: list of notable items (equipment tags, room names, etc.)
113
+ - spatial_coverage: what physical area the page covers
114
+
115
+ YOU MUST RETURN A SINGLE JSON OBJECT with three keys. No other text before or after.
116
+
117
+ {
118
+ "target_pages": [1-indexed page numbers relevant to the question],
119
+ "legend_pages": [1-indexed page numbers that are legends/schedules/notes],
120
+ "crop_tasks": [list of crop task objects]
121
+ }
122
+
123
+ WORKFLOW RULES:
124
+
125
+ 1. **Scan all page metadata.** Match the user's question against page descriptions, \
126
+ key_elements, disciplines, and spatial_coverage to find relevant pages.
127
+
128
+ 2. **Select target pages.** Choose the pages most relevant to the user's question \
129
+ (up to 10). Use the discipline, page_type, key_elements, and description fields \
130
+ to make informed selections.
131
+
132
+ 3. **Identify legend pages.** Use the page_type and discipline fields to find \
133
+ relevant legends. Only include legends for the discipline(s) relevant to \
134
+ the question.
135
+
136
+ 4. **Plan crop tasks.** Based on each page's description and key_elements, create \
137
+ crop tasks targeting specific regions mentioned in the metadata. Each crop task:
138
+ - "page_num": 1-indexed page number (first page = 1)
139
+ - "crop_instruction": precise description of what region to crop. Use information \
140
+ from the page's description and key_elements to write specific instructions.
141
+ - "annotate": true when the question requires counting, tracing, or spatial analysis. \
142
+ false for legends/schedules.
143
+ - "annotation_prompt": when annotate=true, describe what to highlight. Include \
144
+ "Keep the original drawing clearly visible underneath." Empty string when annotate=false.
145
+ - "label": descriptive label using sheet_id and sheet_title from metadata.
146
+ - "priority": 0 for legends/schedules, 1 for detail crops.
147
+
148
+ 5. **Legends first.** Always include crop tasks for relevant legends with priority=0.
149
+
150
+ 6. **Be specific.** Use key_elements and description text from metadata to write \
151
+ precise crop instructions.
152
+
153
+ 7. **Minimize work.** Choose the FEWEST crops needed for completeness. Each crop is \
154
+ expensive (5-30 seconds). Aim for 3-6 total crop tasks. Merge overlapping regions \
155
+ into a single broader crop rather than creating separate crops for adjacent areas \
156
+ on the same page. One well-targeted crop per page is usually sufficient.
157
+
158
+ 8. **Labels matter.** Each crop needs a descriptive label that the analysis model \
159
+ will use to reference the image.
160
+
161
+ ALL PAGE NUMBERS ARE 1-INDEXED. The first page of the PDF is page 1, not page 0.
162
+
163
+ EXAMPLE OUTPUT:
164
+ {
165
+ "target_pages": [5, 12, 14],
166
+ "legend_pages": [5],
167
+ "crop_tasks": [
168
+ {
169
+ "page_num": 5,
170
+ "crop_instruction": "Crop to the HVAC legend showing all duct symbols and abbreviations.",
171
+ "annotate": false,
172
+ "annotation_prompt": "",
173
+ "label": "Page 5 (M-001 Legend) - HVAC Symbol Legend",
174
+ "priority": 0
175
+ },
176
+ {
177
+ "page_num": 12,
178
+ "crop_instruction": "Crop to the gymnasium area showing all supply air diffusers and ductwork.",
179
+ "annotate": true,
180
+ "annotation_prompt": "Draw bright red numbered bounding boxes (1, 2, 3...) around each supply air diffuser symbol. Draw blue boxes around any AHU or RTU. Keep the original drawing clearly visible underneath.",
181
+ "label": "Page 12 (M-101) - Gymnasium Diffusers",
182
+ "priority": 1
183
+ }
184
+ ]
185
+ }
186
+ """
tools/__init__.py ADDED
File without changes
tools/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes). View file
 
tools/__pycache__/crop_cache.cpython-313.pyc ADDED
Binary file (6.87 kB). View file
 
tools/__pycache__/file_search.cpython-313.pyc ADDED
Binary file (2.72 kB). View file
 
tools/__pycache__/image_store.cpython-313.pyc ADDED
Binary file (6.64 kB). View file
 
tools/__pycache__/metadata_cache.cpython-313.pyc ADDED
Binary file (6.93 kB). View file
 
tools/__pycache__/pdf_processor.cpython-313.pyc ADDED
Binary file (4.47 kB). View file
 
tools/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (3.26 kB). View file
 
tools/crop_cache.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """In-session crop cache — avoids redundant Gemini API calls for identical crops.
2
+
3
+ Stored in ``st.session_state`` so it persists across questions within a single
4
+ Streamlit session, but is discarded when the session ends.
5
+
6
+ Matching strategy:
7
+ - **Exact match** on ``(page_num, crop_instruction)`` is the primary lookup.
8
+ - **Fuzzy match** with a simple normalized overlap score handles cases where
9
+ the planner rephrases slightly (e.g., "Crop the gymnasium area" vs
10
+ "Crop gymnasium area showing diffusers"). Only matches above a high
11
+ threshold (0.85) are considered hits to avoid false positives.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import re
17
+ from dataclasses import dataclass, field
18
+
19
+ from state import ImageRef
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class CachedCrop:
    """A cached crop entry with its original instruction and result."""
    # Page the crop was taken from; fuzzy matching is scoped to this page
    page_num: int
    # The planner's crop instruction as originally issued (exact-match key)
    crop_instruction: str
    # Human-readable label describing the crop
    label: str
    # Lightweight reference to the resulting crop image
    image_ref: ImageRef
    # Normalised token set for fuzzy matching (computed once at insert time)
    _tokens: frozenset[str] = field(default_factory=frozenset, repr=False)
33
+
34
+
35
+ def _normalise_tokens(text: str) -> frozenset[str]:
36
+ """Lowercase, strip punctuation, split into a token set."""
37
+ cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
38
+ return frozenset(cleaned.split())
39
+
40
+
41
+ def _token_overlap(a: frozenset[str], b: frozenset[str]) -> float:
42
+ """Jaccard-style overlap: |intersection| / |union|."""
43
+ if not a or not b:
44
+ return 0.0
45
+ return len(a & b) / len(a | b)
46
+
47
+
48
class CropCache:
    """Session-scoped cache mapping (page, instruction) → ImageRef.

    Thread-safe for concurrent reads (dict lookups under CPython's GIL) but
    writes are serialised via the single-threaded Streamlit main thread.
    """

    # Minimum token-overlap score to accept a fuzzy match.
    # Tuned so that minor rephrasing (dropping "the", "all") still matches
    # (~0.78 overlap) while genuinely different instructions miss (~0.06-0.42).
    FUZZY_THRESHOLD: float = 0.70

    def __init__(self) -> None:
        # Primary index: exact (page_num, instruction) → CachedCrop
        self._exact: dict[tuple[int, str], CachedCrop] = {}
        # Secondary list for fuzzy scanning (same objects as _exact values)
        self._entries: list[CachedCrop] = []
        # Hit/miss counters feed the human-readable ``stats`` property.
        self._hit_count: int = 0
        self._miss_count: int = 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def lookup(self, page_num: int, crop_instruction: str) -> ImageRef | None:
        """Return a cached ImageRef if a matching crop exists, else None.

        Tries exact match first, then falls back to fuzzy token overlap
        restricted to the same page.
        """
        key = (page_num, crop_instruction)

        # 1. Exact match
        if key in self._exact:
            self._hit_count += 1
            entry = self._exact[key]
            logger.info(
                "CropCache HIT (exact) page=%d instruction='%s' → %s",
                page_num, crop_instruction[:60], entry.image_ref["id"],
            )
            return entry.image_ref

        # 2. Fuzzy match — only among entries on the same page
        query_tokens = _normalise_tokens(crop_instruction)
        best_score = 0.0
        best_entry: CachedCrop | None = None

        # Linear scan over all entries; acceptable because the cache only
        # holds crops produced within a single session.
        for entry in self._entries:
            if entry.page_num != page_num:
                continue
            score = _token_overlap(query_tokens, entry._tokens)
            if score > best_score:
                best_score = score
                best_entry = entry

        if best_entry is not None and best_score >= self.FUZZY_THRESHOLD:
            self._hit_count += 1
            logger.info(
                "CropCache HIT (fuzzy %.2f) page=%d instruction='%s' → %s",
                best_score, page_num, crop_instruction[:60],
                best_entry.image_ref["id"],
            )
            return best_entry.image_ref

        self._miss_count += 1
        return None

    def register(
        self,
        page_num: int,
        crop_instruction: str,
        label: str,
        image_ref: ImageRef,
        *,
        is_fallback: bool = False,
    ) -> None:
        """Register a successful crop in the cache.

        Duplicate keys are ignored (first registration wins).

        Parameters
        ----------
        is_fallback
            If True, the crop is a full-page fallback (Gemini failed to crop).
            These are NOT cached because they don't represent a useful targeted crop.
        """
        if is_fallback:
            logger.debug(
                "CropCache SKIP (fallback) page=%d instruction='%s'",
                page_num, crop_instruction[:60],
            )
            return

        key = (page_num, crop_instruction)
        if key in self._exact:
            return  # already cached

        entry = CachedCrop(
            page_num=page_num,
            crop_instruction=crop_instruction,
            label=label,
            image_ref=image_ref,
            _tokens=_normalise_tokens(crop_instruction),
        )
        self._exact[key] = entry
        self._entries.append(entry)
        logger.info(
            "CropCache REGISTER page=%d instruction='%s' → %s",
            page_num, crop_instruction[:60], image_ref["id"],
        )

    @property
    def size(self) -> int:
        """Number of cached crop entries."""
        return len(self._entries)

    @property
    def stats(self) -> str:
        """Human-readable one-line summary of cache size and hit rate."""
        total = self._hit_count + self._miss_count
        rate = (self._hit_count / total * 100) if total > 0 else 0
        return (
            f"CropCache: {self.size} entries, "
            f"{self._hit_count} hits / {self._miss_count} misses "
            f"({rate:.0f}% hit rate)"
        )

    def clear(self) -> None:
        """Reset the cache (e.g., when a new PDF is loaded)."""
        self._exact.clear()
        self._entries.clear()
        self._hit_count = 0
        self._miss_count = 0
tools/image_store.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import io
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from pathlib import Path
9
+
10
+ from PIL import Image
11
+
12
+ from state import ImageRef
13
+
14
+
15
class ImageStore:
    """Disk-backed image manager.

    LangGraph state carries only lightweight ``ImageRef`` dicts; every heavy
    image payload is persisted beneath ``base_dir`` (``pages/``, ``crops/``,
    ``annotated/``) and read back from disk on demand.
    """

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)
        self._pages_dir = self.base_dir / "pages"
        self._crops_dir = self.base_dir / "crops"
        self._annotated_dir = self.base_dir / "annotated"
        for subdir in (self._pages_dir, self._crops_dir, self._annotated_dir):
            subdir.mkdir(exist_ok=True)

    # ------------------------------------------------------------------
    # Save helpers
    # ------------------------------------------------------------------

    def save_page_image(self, page_num: int, image_bytes: bytes) -> ImageRef:
        """Decode raw page bytes, store them as a PNG, and return a reference."""
        decoded = Image.open(io.BytesIO(image_bytes))
        destination = self._pages_dir / f"page_{page_num}.png"
        decoded.save(str(destination), format="PNG")
        return ImageRef(
            id=f"page_{page_num}",
            path=str(destination),
            label=f"Page {page_num} (full page)",
            page_num=page_num,
            crop_type="full_page",
            width=decoded.width,
            height=decoded.height,
        )

    def save_crop(
        self,
        page_num: int,
        crop_id: str,
        image: Image.Image,
        label: str,
    ) -> ImageRef:
        """Store a cropped region image and return a reference to it."""
        ref_id = f"page_{page_num}_{crop_id}"
        destination = self._crops_dir / f"{ref_id}.png"
        image.save(str(destination), format="PNG")
        return ImageRef(
            id=ref_id,
            path=str(destination),
            label=label,
            page_num=page_num,
            crop_type="crop",
            width=image.width,
            height=image.height,
        )

    def save_annotated(
        self,
        source_ref: ImageRef,
        annotated_image: Image.Image,
    ) -> ImageRef:
        """Store an annotated variant of *source_ref* and return its reference."""
        ann_id = f"{source_ref['id']}_ann"
        destination = self._annotated_dir / f"{ann_id}.png"
        annotated_image.save(str(destination), format="PNG")
        return ImageRef(
            id=ann_id,
            path=str(destination),
            label=f"{source_ref['label']} [annotated]",
            page_num=source_ref["page_num"],
            crop_type="annotated",
            width=annotated_image.width,
            height=annotated_image.height,
        )

    # ------------------------------------------------------------------
    # Load helpers
    # ------------------------------------------------------------------

    def load_image(self, ref: ImageRef) -> Image.Image:
        """Open the referenced image lazily with PIL."""
        return Image.open(ref["path"])

    def load_bytes(self, ref: ImageRef) -> bytes:
        """Read the referenced image's raw bytes from disk."""
        return Path(ref["path"]).read_bytes()

    def get_page_image_path(self, page_num: int) -> str:
        """Return the path where the full render of *page_num* lives."""
        return str(self._pages_dir / f"page_{page_num}.png")

    def load_page_bytes(self, page_num: int) -> bytes:
        """Read the raw bytes of a full page render from disk."""
        return Path(self.get_page_image_path(page_num)).read_bytes()

    # ------------------------------------------------------------------
    # Format conversions for different model APIs
    # ------------------------------------------------------------------

    def to_gemini_part(self, ref: ImageRef):
        """Return a ``google.genai.types.Part`` for Gemini multimodal prompts."""
        from google.genai import types

        return types.Part.from_bytes(
            data=self.load_bytes(ref),
            mime_type="image/png",
        )

    def to_openai_base64(self, ref: ImageRef) -> dict:
        """Return an OpenAI-compatible image content block (base64 data URI)."""
        encoded = base64.b64encode(self.load_bytes(ref)).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encoded}"},
        }

    def create_thumbnail(self, ref: ImageRef, max_size: int = 400) -> bytes:
        """Downscale the referenced image (aspect-preserving) and return PNG bytes."""
        thumb = self.load_image(ref)
        # PIL's thumbnail() resizes in place, never upscales.
        thumb.thumbnail((max_size, max_size))
        out = io.BytesIO()
        thumb.save(out, format="PNG")
        return out.getvalue()

    # ------------------------------------------------------------------
    # Cleanup
    # ------------------------------------------------------------------

    def cleanup(self):
        """Best-effort removal of the entire on-disk image tree."""
        if self.base_dir.exists():
            shutil.rmtree(self.base_dir, ignore_errors=True)
tools/metadata_cache.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Disk-based metadata cache + thread-safe in-memory container for background generation."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import json
6
+ import logging
7
+ import threading
8
+ from pathlib import Path
9
+
10
logger = logging.getLogger(__name__)

# Cache directory local to the project
CACHE_DIR = Path(__file__).resolve().parent.parent / ".cache" / "metadata"

# Cache version — bump this when the metadata format changes to invalidate old caches.
# The version is embedded in each cache filename ("<sha256>_<version>.json"),
# so bumping it simply stops matching older files rather than deleting them.
# v2: switched page_num from 0-indexed to 1-indexed
# v3: removed related_legends, has_title_block, title_block_text; parallel batch generation
_CACHE_VERSION = "v3"
19
+
20
+
21
+ def _pdf_hash(pdf_bytes: bytes) -> str:
22
+ """Compute a SHA-256 hash of the PDF bytes for cache keying."""
23
+ return hashlib.sha256(pdf_bytes).hexdigest()
24
+
25
+
26
def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None:
    """Check if metadata exists on disk for the given PDF.

    Returns the metadata list if found, None otherwise.
    """
    candidate = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    if not candidate.exists():
        return None
    try:
        return json.loads(candidate.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable cache file — treat as a miss.
        return None
38
+
39
+
40
def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None:
    """Save metadata to disk, keyed by PDF hash (and cache version)."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    destination = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    serialized = json.dumps(metadata_list, indent=2)
    destination.write_text(serialized, encoding="utf-8")
48
+
49
+
50
class MetadataState:
    """Thread-safe container for background metadata generation state.

    Stored as a single object in ``st.session_state``. The background thread
    mutates fields on *this same object* (safe under CPython's GIL for simple
    attribute assignments). The main Streamlit thread reads from it on each
    rerun.
    """

    def __init__(self) -> None:
        self.status: str = "not_started"  # not_started | in_progress | ready | failed
        self.data_json: str = ""  # pre-serialized JSON for the planner
        self.error: str | None = None
        self._lock = threading.Lock()

    # -- convenience helpers --------------------------------------------------

    def set_ready(self, data_json: str) -> None:
        """Store the finished metadata JSON and mark the state as ready."""
        with self._lock:
            self.data_json = data_json
            self.status = "ready"

    def set_failed(self, error: str) -> None:
        """Record the failure message and mark the state as failed."""
        with self._lock:
            self.error = error
            self.status = "failed"

    def set_in_progress(self) -> None:
        """Mark generation as started."""
        with self._lock:
            self.status = "in_progress"

    @property
    def is_ready(self) -> bool:
        # Lock-free read of a single attribute is safe under the GIL.
        return self.status == "ready"

    def _generate(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        *,
        context: str = "Metadata",
    ) -> None:
        """Shared generation body for the sync and background entry points.

        Generates per-page metadata, persists it to the disk cache, and
        transitions this object to ``ready`` (or ``failed`` on any error).
        ``context`` only changes the log wording ("Metadata" vs
        "Background metadata"); behavior is otherwise identical.
        """
        try:
            # Deferred import — only needed when generation actually runs.
            from nodes.metadata_generator import generate_page_metadata

            metadata_list = generate_page_metadata(pdf_path, num_pages)
            save_metadata(pdf_bytes, metadata_list)
            self.set_ready(json.dumps(metadata_list, indent=2))
            logger.info("%s generation complete.", context)
        except Exception as e:
            self.set_failed(str(e))
            logger.exception("%s generation failed", context)

    def generate_sync(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Generate metadata synchronously (blocking).

        Same logic as ``start_background_generation`` but runs in the calling
        thread. Used during initialization so metadata is ready before the
        user can ask questions.
        """
        self.set_in_progress()
        self._generate(pdf_path, num_pages, pdf_bytes, context="Metadata")

    def start_background_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Launch a daemon thread that generates metadata and writes to disk cache."""
        self.set_in_progress()
        thread = threading.Thread(
            target=self._generate,
            args=(pdf_path, num_pages, pdf_bytes),
            kwargs={"context": "Background metadata"},
            daemon=True,
        )
        thread.start()
tools/pdf_processor.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import fitz # PyMuPDF
7
+
8
+ from config import PDF_RENDER_DPI
9
+
10
+
11
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Uses the document as a context manager so the handle is released even
    if opening or reading the page count raises.
    """
    with fitz.open(pdf_path) as doc:
        return len(doc)
17
+
18
+
19
def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int:
    """Render every PDF page as a PNG image.

    This is the primary rendering method, called once during PDF ingestion
    to pre-render all pages at the configured DPI.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory receiving ``page_<n>.png`` files (created if missing).
        dpi: Render resolution; PyMuPDF's base resolution is 72 dpi, so the
            zoom factor is ``dpi / 72``.

    Returns:
        The number of pages rendered.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # Context manager guarantees the document is closed even if a page
    # fails to render partway through the loop (the original leaked the
    # handle on any mid-loop exception).
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)
        for page_num in range(num_pages):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=matrix)
            (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))

    return num_pages
43
+
44
+
45
def render_single_page(
    pdf_path: str,
    page_num: int,
    output_dir: str,
    dpi: int = PDF_RENDER_DPI,
) -> None:
    """Render a single PDF page as a PNG and save to disk.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Page to render (0-indexed, as consumed by ``load_page``).
        output_dir: Directory receiving ``page_<n>.png`` (created if missing).
        dpi: Render resolution (zoom factor = dpi / 72).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    # Context manager releases the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))
63
+
64
+
65
def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes:
    """Extract a range of pages from a PDF and return as in-memory PDF bytes.

    Args:
        pdf_path: Path to the source PDF.
        start: First page index (0-indexed, inclusive).
        end: Last page index (0-indexed, inclusive).

    Returns:
        Raw bytes of a new PDF containing only the specified pages.
    """
    # Context managers close BOTH documents even if insert_pdf/tobytes raises
    # (the original leaked both handles on error).
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        dst.insert_pdf(src, from_page=start, to_page=end)
        return dst.tobytes()
83
+
84
+
85
def get_page_image_bytes(
    page_image_dir: str,
    page_num: int,
) -> bytes:
    """Load a pre-rendered page image from disk.

    Pages are expected to already exist from the upfront bulk render
    performed during PDF ingestion.
    """
    image_path = Path(page_image_dir) / f"page_{page_num}.png"
    with open(image_path, "rb") as fh:
        return fh.read()