Ryan2219 commited on
Commit
d8d538f
·
verified ·
1 Parent(s): 4a413b3

Upload 47 files

Browse files
Files changed (48) hide show
  1. .gitattributes +1 -0
  2. default_drawings/NorthMaconPark.pdf +3 -0
  3. nodes/__init__.py +0 -0
  4. nodes/__pycache__/__init__.cpython-313.pyc +0 -0
  5. nodes/__pycache__/analyzer.cpython-313.pyc +0 -0
  6. nodes/__pycache__/annotator.cpython-313.pyc +0 -0
  7. nodes/__pycache__/consensus.cpython-313.pyc +0 -0
  8. nodes/__pycache__/cropper.cpython-313.pyc +0 -0
  9. nodes/__pycache__/ingest.cpython-313.pyc +0 -0
  10. nodes/__pycache__/legends.cpython-313.pyc +0 -0
  11. nodes/__pycache__/metadata_generator.cpython-313.pyc +0 -0
  12. nodes/__pycache__/planner.cpython-313.pyc +0 -0
  13. nodes/__pycache__/retrieve.cpython-313.pyc +0 -0
  14. nodes/__pycache__/synthesizer.cpython-313.pyc +0 -0
  15. nodes/analyzer.py +132 -0
  16. nodes/annotator.py +117 -0
  17. nodes/consensus.py +58 -0
  18. nodes/cropper.py +204 -0
  19. nodes/ingest.py +22 -0
  20. nodes/metadata_generator.py +186 -0
  21. nodes/planner.py +127 -0
  22. nodes/synthesizer.py +58 -0
  23. prompts/__init__.py +0 -0
  24. prompts/__pycache__/__init__.cpython-313.pyc +0 -0
  25. prompts/__pycache__/analyzer.cpython-313.pyc +0 -0
  26. prompts/__pycache__/annotator.cpython-313.pyc +0 -0
  27. prompts/__pycache__/consensus.cpython-313.pyc +0 -0
  28. prompts/__pycache__/cropper.cpython-313.pyc +0 -0
  29. prompts/__pycache__/metadata.cpython-313.pyc +0 -0
  30. prompts/__pycache__/planner.cpython-313.pyc +0 -0
  31. prompts/analyzer.py +56 -0
  32. prompts/annotator.py +22 -0
  33. prompts/consensus.py +29 -0
  34. prompts/cropper.py +21 -0
  35. prompts/metadata.py +40 -0
  36. prompts/planner.py +186 -0
  37. tools/__init__.py +0 -0
  38. tools/__pycache__/__init__.cpython-313.pyc +0 -0
  39. tools/__pycache__/crop_cache.cpython-313.pyc +0 -0
  40. tools/__pycache__/file_search.cpython-313.pyc +0 -0
  41. tools/__pycache__/image_store.cpython-313.pyc +0 -0
  42. tools/__pycache__/metadata_cache.cpython-313.pyc +0 -0
  43. tools/__pycache__/pdf_processor.cpython-313.pyc +0 -0
  44. tools/__pycache__/vector_store.cpython-313.pyc +0 -0
  45. tools/crop_cache.py +176 -0
  46. tools/image_store.py +138 -0
  47. tools/metadata_cache.py +131 -0
  48. tools/pdf_processor.py +95 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Gemini_Generated_Image_3ow7sj3ow7sj3ow7.png filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Gemini_Generated_Image_3ow7sj3ow7sj3ow7.png filter=lfs diff=lfs merge=lfs -text
37
+ default_drawings/NorthMaconPark.pdf filter=lfs diff=lfs merge=lfs -text
default_drawings/NorthMaconPark.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aed76b73fbe205e1579e3a00be6e95b7564e72594b1fdb83311819d447f8fc4
3
+ size 39114794
nodes/__init__.py ADDED
File without changes
nodes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes). View file
 
nodes/__pycache__/analyzer.cpython-313.pyc ADDED
Binary file (5.87 kB). View file
 
nodes/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (5.05 kB). View file
 
nodes/__pycache__/consensus.cpython-313.pyc ADDED
Binary file (2.37 kB). View file
 
nodes/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (7.69 kB). View file
 
nodes/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (917 Bytes). View file
 
nodes/__pycache__/legends.cpython-313.pyc ADDED
Binary file (3.57 kB). View file
 
nodes/__pycache__/metadata_generator.cpython-313.pyc ADDED
Binary file (7.51 kB). View file
 
nodes/__pycache__/planner.cpython-313.pyc ADDED
Binary file (5.53 kB). View file
 
nodes/__pycache__/retrieve.cpython-313.pyc ADDED
Binary file (6.3 kB). View file
 
nodes/__pycache__/synthesizer.cpython-313.pyc ADDED
Binary file (2.23 kB). View file
 
nodes/analyzer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """analyze_findings node — parent agent examines crops and answers the question."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+
7
+ from google import genai
8
+ from google.genai import types
9
+
10
+ from config import ANALYZER_MODEL, GOOGLE_API_KEY
11
+ from prompts.analyzer import ANALYZER_SYSTEM_PROMPT
12
+ from state import CropTask, DrawingReaderState
13
+ from tools.image_store import ImageStore
14
+
15
+
16
def analyze_findings(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Review all cropped/annotated images and produce an answer.

    Builds a multimodal request (legend crops first, then detail crops, then
    annotated versions), sends it to the analyzer model, and parses an
    optional trailing ```json block in which the model may request more crops.

    Returns a partial state update with ``gemini_analysis``,
    ``investigation_round``, ``needs_more_investigation``, ``status_message``,
    and — only when the model asked for more — ``crop_tasks``.
    """
    question = state["question"]
    image_refs = state.get("image_refs", [])
    legend_pages = set(state.get("legend_pages", []))
    investigation_round = state.get("investigation_round", 0)

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Build multimodal content — legends first, then crops, then annotated.
    content_parts: list[types.Part] = []
    content_parts.append(types.Part.from_text(text=f"USER QUESTION: {question}"))

    # Sort images: legend crops first, then detail crops, then annotated versions.
    legend_refs = [r for r in image_refs if r["page_num"] in legend_pages and r["crop_type"] == "crop"]
    detail_crops = [r for r in image_refs if r["page_num"] not in legend_pages and r["crop_type"] == "crop"]
    annotated_refs = [r for r in image_refs if r["crop_type"] == "annotated"]

    ordered_refs = legend_refs + detail_crops + annotated_refs

    # Section headers are inserted immediately before the first image of each group.
    first_detail_id = detail_crops[0]["id"] if detail_crops else None
    first_annotated_id = annotated_refs[0]["id"] if annotated_refs else None

    if legend_refs:
        content_parts.append(
            types.Part.from_text(
                text="\n=== LEGEND / SCHEDULE CROPS (study these first) ===",
            )
        )

    for ref in ordered_refs:
        if first_detail_id is not None and ref["id"] == first_detail_id:
            content_parts.append(
                types.Part.from_text(text="\n=== DETAIL CROPS ===")
            )
        if first_annotated_id is not None and ref["id"] == first_annotated_id:
            content_parts.append(
                types.Part.from_text(
                    text="\n=== ANNOTATED CROPS (numbered/highlighted versions) ===",
                )
            )

        content_parts.append(
            types.Part.from_text(text=f"\nImage: {ref['label']}")
        )
        try:
            content_parts.append(image_store.to_gemini_part(ref))
        except Exception as e:
            # Keep going with a placeholder so one bad image doesn't abort the run.
            content_parts.append(
                types.Part.from_text(text=f"(Could not load image: {e})")
            )

    content_parts.append(
        types.Part.from_text(
            text=f"\nThis is investigation round {investigation_round + 1}. "
            "Analyze the images and answer the user's question. "
            "If you need more crops, include a JSON block at the end of your response."
        )
    )

    response = client.models.generate_content(
        model=ANALYZER_MODEL,
        contents=[types.Content(role="user", parts=content_parts)],
        config=types.GenerateContentConfig(
            system_instruction=ANALYZER_SYSTEM_PROMPT,
        ),
    )

    # BUGFIX: response.text can be None (e.g. blocked response / no text
    # parts), and re.search(pattern, None) raises TypeError. Coerce to "".
    analysis_text = response.text or ""

    # Check if the model requested additional investigation.
    needs_more = False
    additional_crops: list[CropTask] = []

    json_match = re.search(
        r'```json\s*(\{.*?"needs_more"\s*:\s*true.*?\})\s*```',
        analysis_text,
        re.DOTALL,
    )
    if json_match:
        try:
            extra = json.loads(json_match.group(1))
            if extra.get("needs_more"):
                needs_more = True
                for t in extra.get("additional_crops", []):
                    raw_page = int(t.get("page_num", 1))
                    additional_crops.append(
                        CropTask(
                            page_num=raw_page - 1,  # convert 1-indexed → 0-indexed
                            crop_instruction=t.get("crop_instruction", ""),
                            annotate=bool(t.get("annotate", False)),
                            annotation_prompt=t.get("annotation_prompt", ""),
                            label=t.get("label", "Additional crop"),
                            priority=int(t.get("priority", 1)),
                        )
                    )
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            # BUGFIX: int() on model-supplied fields can raise ValueError /
            # TypeError; this parse is best-effort, so swallow those too
            # instead of crashing the node.
            pass

        # Strip the JSON block from the user-visible analysis text.
        analysis_text = analysis_text[: json_match.start()].strip()

    result: dict = {
        "gemini_analysis": analysis_text,
        "investigation_round": investigation_round + 1,
        "needs_more_investigation": needs_more,
        "status_message": "Analysis complete."
        if not needs_more
        else f"Requesting {len(additional_crops)} additional crops (round {investigation_round + 2}).",
    }

    if additional_crops:
        result["crop_tasks"] = additional_crops

    return result
nodes/annotator.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """annotate_crops node — nano-banana (Gemini image generation) for semantic annotation."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from google import genai
8
+ from google.genai import types
9
+ from PIL import Image
10
+
11
+ from config import ANNOTATOR_MODEL, GOOGLE_API_KEY
12
+ from prompts.annotator import ANNOTATION_WRAPPER
13
+ from state import DrawingReaderState, ImageRef
14
+ from tools.image_store import ImageStore
15
+
16
+
17
+ def _extract_generated_image(response) -> Image.Image | None:
18
+ """Extract the generated image from a Gemini image-generation response."""
19
+ for part in response.candidates[0].content.parts:
20
+ if part.inline_data is not None:
21
+ return Image.open(io.BytesIO(part.inline_data.data))
22
+ return None
23
+
24
+
25
def _annotate_single_crop_sync(
    client: genai.Client,
    crop_ref: ImageRef,
    annotation_prompt: str,
    image_store: ImageStore,
) -> ImageRef | None:
    """Run a single crop through the nano-banana annotation model (blocking).

    Returns the stored annotated ImageRef, or None when the model response
    contained no image.
    """
    source_bytes = image_store.load_bytes(crop_ref)
    prompt = ANNOTATION_WRAPPER.format(annotation_prompt=annotation_prompt)

    response = client.models.generate_content(
        model=ANNOTATOR_MODEL,
        contents=[
            types.Part.from_bytes(data=source_bytes, mime_type="image/png"),
            prompt,
        ],
        config=types.GenerateContentConfig(
            response_modalities=["TEXT", "IMAGE"],
        ),
    )

    image = _extract_generated_image(response)
    if image is None:
        return None
    return image_store.save_annotated(crop_ref, image)
53
+
54
+
55
def annotate_crops(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Run nano-banana annotation on the crops that asked for it.

    Pairs the most recent batch of crops (the tail of ``image_refs``) with
    the current ``crop_tasks`` by position, annotates the flagged ones in a
    thread pool, and returns the new annotated ImageRefs plus a status line.
    """
    crop_tasks = state.get("crop_tasks", [])
    image_refs = state.get("image_refs", [])

    # Positional pairing: execute_crops produces one crop per task, so the
    # newest len(crop_tasks) crop-type refs line up with the current tasks —
    # this keeps loop-back rounds from re-matching older crops.
    crop_refs = [r for r in image_refs if r["crop_type"] == "crop"]
    recent = crop_refs[-len(crop_tasks):] if crop_tasks else []

    jobs: list[tuple[ImageRef, str]] = [
        (recent[i], task["annotation_prompt"])
        for i, task in enumerate(crop_tasks)
        if task["annotate"] and task["annotation_prompt"] and i < len(recent)
    ]

    if not jobs:
        return {"status_message": "No annotation needed for these crops."}

    client = genai.Client(api_key=GOOGLE_API_KEY)

    # Threads (not asyncio) so we don't fight Streamlit's own event loop.
    outcomes: list[ImageRef | None | Exception] = [None] * len(jobs)
    with ThreadPoolExecutor(max_workers=min(len(jobs), 4)) as pool:
        pending = {
            pool.submit(_annotate_single_crop_sync, client, ref, prompt, image_store): i
            for i, (ref, prompt) in enumerate(jobs)
        }
        for done in as_completed(pending):
            slot = pending[done]
            try:
                outcomes[slot] = done.result()
            except Exception as exc:
                outcomes[slot] = exc

    annotated: list[ImageRef] = []
    problems: list[str] = []
    for i, outcome in enumerate(outcomes):
        if isinstance(outcome, Exception):
            problems.append(f"Annotation {i} failed: {outcome}")
        elif outcome is None:
            problems.append(f"Annotation {i} returned no image")
        else:
            annotated.append(outcome)

    status = f"Annotated {len(annotated)} of {len(jobs)} crops."
    if problems:
        status += f" Issues: {'; '.join(problems)}"

    return {
        "image_refs": annotated,
        "status_message": status,
    }
nodes/consensus.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """consensus_review node — GPT-4o reviews Gemini's analysis."""
2
+ from __future__ import annotations
3
+
4
+ from openai import OpenAI
5
+
6
+ from config import CONSENSUS_MODEL, OPENAI_API_KEY
7
+ from prompts.consensus import CONSENSUS_SYSTEM_PROMPT
8
+ from state import DrawingReaderState
9
+ from tools.image_store import ImageStore
10
+
11
+
12
def consensus_review(state: DrawingReaderState, image_store: ImageStore) -> dict:
    """Send the crops plus Gemini's draft answer to GPT for peer review.

    Returns a partial state update with ``gpt_analysis`` (the reviewer's
    text, or "" when there was nothing to review) and a status message.
    """
    question = state["question"]
    gemini_analysis = state.get("gemini_analysis", "")
    image_refs = state.get("image_refs", [])

    # Nothing to review — skip the API call entirely.
    if not gemini_analysis:
        return {"gpt_analysis": "", "status_message": "No analysis to review."}

    client = OpenAI(api_key=OPENAI_API_KEY)

    # Assemble the multimodal user message: question, draft, then every image
    # preceded by its label.
    user_content: list[dict] = [
        {"type": "text", "text": f"USER QUESTION: {question}"},
        {"type": "text", "text": f"ANALYST'S DRAFT ANSWER:\n{gemini_analysis}"},
        {"type": "text", "text": "\nBELOW ARE THE SAME CROPPED IMAGES THE ANALYST EXAMINED:"},
    ]

    for ref in image_refs:
        user_content.append({"type": "text", "text": f"\nImage: {ref['label']}"})
        try:
            user_content.append(image_store.to_openai_base64(ref))
        except Exception as e:
            # Degrade gracefully — tell the reviewer the image is unavailable.
            user_content.append({"type": "text", "text": f"(Could not load image: {e})"})

    user_content.append({"type": "text", "text": "\nPerform your peer review as specified."})

    response = client.chat.completions.create(
        model=CONSENSUS_MODEL,
        messages=[
            {"role": "system", "content": CONSENSUS_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )

    review_text = response.choices[0].message.content or ""

    return {
        "gpt_analysis": review_text,
        "status_message": "GPT consensus review complete.",
    }
nodes/cropper.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """execute_crops node — Gemini code_execution for agentic cropping (PoC 1 style)."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import logging
6
+ import uuid
7
+ from collections.abc import Callable
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ from google import genai
11
+ from google.genai import types
12
+ from PIL import Image
13
+
14
+ from config import CROPPER_MODEL, GOOGLE_API_KEY
15
+ from prompts.cropper import CROPPER_PROMPT_TEMPLATE
16
+ from state import CropTask, DrawingReaderState, ImageRef
17
+ from tools.crop_cache import CropCache
18
+ from tools.image_store import ImageStore
19
+ from tools.pdf_processor import get_page_image_bytes
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Type alias for the progress callback.
24
+ # Signature: (completed_ref, crop_task, source, completed_count, total_count)
25
+ ProgressCallback = Callable[[ImageRef, CropTask, str, int, int], None]
26
+
27
+
28
+ def _extract_last_image(response) -> Image.Image | None:
29
+ """Extract the last generated image from a Gemini code_execution response."""
30
+ last_image = None
31
+ for part in response.candidates[0].content.parts:
32
+ # Try as_image() first
33
+ try:
34
+ img_data = part.as_image()
35
+ if img_data is not None:
36
+ last_image = Image.open(io.BytesIO(img_data.image_bytes))
37
+ continue
38
+ except Exception:
39
+ pass
40
+ # Fallback: inline_data
41
+ try:
42
+ if hasattr(part, "inline_data") and part.inline_data is not None:
43
+ img_bytes = part.inline_data.data
44
+ last_image = Image.open(io.BytesIO(img_bytes))
45
+ except Exception:
46
+ pass
47
+ return last_image
48
+
49
+
50
def _execute_single_crop_sync(
    client: genai.Client,
    page_image_bytes: bytes,
    crop_task: CropTask,
    image_store: ImageStore,
) -> tuple[ImageRef, bool]:
    """Run one crop through Gemini code_execution (blocking).

    Returns
    -------
    (image_ref, is_fallback)
        ``is_fallback`` is True when Gemini produced no cropped image and the
        untouched full page was stored instead. Callers should not cache
        fallback results.
    """
    prompt = CROPPER_PROMPT_TEMPLATE.format(
        crop_instruction=crop_task["crop_instruction"],
    )
    page_part = types.Part.from_bytes(data=page_image_bytes, mime_type="image/png")

    response = client.models.generate_content(
        model=CROPPER_MODEL,
        contents=[page_part, prompt],
        config=types.GenerateContentConfig(
            tools=[types.Tool(code_execution=types.ToolCodeExecution)]
        ),
    )

    cropped = _extract_last_image(response)
    if cropped is None:
        # No image came back — fall back to the whole page.
        result_image = Image.open(io.BytesIO(page_image_bytes))
        is_fallback = True
    else:
        result_image = cropped
        is_fallback = False

    ref = image_store.save_crop(
        page_num=crop_task["page_num"],
        crop_id=f"crop_{uuid.uuid4().hex[:6]}",
        image=result_image,
        label=crop_task["label"],
    )
    return ref, is_fallback
92
+
93
+
94
def execute_crops(
    state: DrawingReaderState,
    image_store: ImageStore,
    crop_cache: CropCache | None = None,
    progress_callback: ProgressCallback | None = None,
) -> dict:
    """Execute all crop tasks concurrently, reusing cached crops when possible.

    Results are returned in the SAME order as ``crop_tasks`` (see bugfix note
    below), regardless of cache hits or thread completion order.

    Parameters
    ----------
    progress_callback
        Optional callback invoked on the **main thread** each time a crop
        completes (or is served from cache). Called with
        ``(image_ref, crop_task, source, completed_count, total_count)``
        where *source* is ``"cached"``, ``"completed"``, or ``"fallback"``.
    """
    crop_tasks = state.get("crop_tasks", [])
    page_image_dir = state["page_image_dir"]

    if not crop_tasks:
        return {"status_message": "No crop tasks to execute."}

    total_count = len(crop_tasks)
    completed_count = 0

    # BUGFIX: results used to be appended in completion order (cache hits
    # first, then whichever thread finished first). Downstream, annotate_crops
    # pairs the newest crops with crop_tasks positionally, so out-of-order
    # completion mis-paired annotation prompts with crops. Fill a slot per
    # original task index instead, then compact, so refs line up with tasks.
    slots: list[ImageRef | None] = [None] * total_count
    tasks_to_execute: list[tuple[int, CropTask]] = []  # (original_index, task)
    cache_hits = 0

    # ----- Phase 1: serve cache hits, queue everything else -----
    for i, ct in enumerate(crop_tasks):
        cached_ref = (
            crop_cache.lookup(ct["page_num"], ct["crop_instruction"])
            if crop_cache is not None
            else None
        )
        if cached_ref is not None:
            slots[i] = cached_ref
            cache_hits += 1
            completed_count += 1
            logger.info(
                "Reusing cached crop for '%s' (page %d)",
                ct["label"], ct["page_num"],
            )
            # Notify the UI immediately for each cache hit.
            if progress_callback is not None:
                progress_callback(
                    cached_ref, ct, "cached", completed_count, total_count,
                )
        else:
            tasks_to_execute.append((i, ct))

    # ----- Phase 2: execute uncached crops via Gemini -----
    errors: list[str] = []

    if tasks_to_execute:
        client = genai.Client(api_key=GOOGLE_API_KEY)

        with ThreadPoolExecutor(max_workers=min(len(tasks_to_execute), 4)) as pool:
            future_to_task: dict = {}
            for orig_idx, ct in tasks_to_execute:
                page_bytes = get_page_image_bytes(page_image_dir, ct["page_num"])
                future = pool.submit(
                    _execute_single_crop_sync, client, page_bytes, ct, image_store,
                )
                future_to_task[future] = (orig_idx, ct)

            # Process results as they arrive — this runs on the MAIN thread,
            # so invoking the Streamlit progress callback here is safe.
            for future in as_completed(future_to_task):
                orig_idx, ct = future_to_task[future]
                try:
                    ref, is_fallback = future.result()
                    slots[orig_idx] = ref
                    completed_count += 1

                    # Register in cache (the cache itself skips fallbacks).
                    if crop_cache is not None:
                        crop_cache.register(
                            page_num=ct["page_num"],
                            crop_instruction=ct["crop_instruction"],
                            label=ct["label"],
                            image_ref=ref,
                            is_fallback=is_fallback,
                        )

                    # Notify the UI as each crop completes.
                    if progress_callback is not None:
                        source = "fallback" if is_fallback else "completed"
                        progress_callback(
                            ref, ct, source, completed_count, total_count,
                        )

                except Exception as e:
                    errors.append(f"Crop task {orig_idx} failed: {e}")

    # Compact: drop failed slots while preserving task order.
    image_refs: list[ImageRef] = [r for r in slots if r is not None]

    # ----- Phase 3: build status message -----
    api_count = len(tasks_to_execute) - len(errors)
    parts = [f"Completed {len(image_refs)} of {total_count} crops"]
    if cache_hits:
        parts.append(f"({cache_hits} from cache, {api_count} new)")
    if errors:
        parts.append(f"Errors: {'; '.join(errors)}")
    status = ". ".join(parts) + "."

    if crop_cache is not None:
        logger.info(crop_cache.stats)

    return {
        "image_refs": image_refs,
        "status_message": status,
    }
nodes/ingest.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ingest_pdf node — renders all PDF pages as images at configured DPI."""
2
+ from __future__ import annotations
3
+
4
+ from state import DrawingReaderState
5
+ from tools import pdf_processor
6
+
7
+
8
def ingest_pdf(state: DrawingReaderState) -> dict:
    """Render every page of the input PDF to a PNG for visual analysis.

    Rendering happens at PDF_RENDER_DPI (100 DPI), a balance between speed
    and legibility for construction drawings.
    """
    page_count = pdf_processor.render_pages(
        state["pdf_path"], state["page_image_dir"]
    )
    return {
        "num_pages": page_count,
        "status_message": f"Converted {page_count} pages to images.",
    }
nodes/metadata_generator.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Background page metadata generator — extracts per-page descriptions from the full PDF.
2
+
3
+ Uses parallel batch processing: the PDF is split into 5-page chunks and each
4
+ chunk is sent to Gemini concurrently for faster metadata extraction.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ import math
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+
13
+ from google import genai
14
+ from google.genai import types
15
+
16
+ from config import GOOGLE_API_KEY, METADATA_MODEL
17
+ from prompts.metadata import METADATA_SYSTEM_PROMPT
18
+ from tools.pdf_processor import extract_page_range_bytes
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Number of PDF pages per batch sent to Gemini in parallel.
23
+ BATCH_SIZE = 5
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # JSON extraction helper
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def _extract_json_array(response_text: str) -> list[dict]:
31
+ """Extract the outermost balanced JSON array from a response string."""
32
+ start = response_text.find("[")
33
+ if start == -1:
34
+ raise ValueError("No JSON array found in metadata generation response")
35
+
36
+ depth = 0
37
+ end = None
38
+ for i in range(start, len(response_text)):
39
+ if response_text[i] == "[":
40
+ depth += 1
41
+ elif response_text[i] == "]":
42
+ depth -= 1
43
+ if depth == 0:
44
+ end = i
45
+ break
46
+
47
+ if end is None:
48
+ raise ValueError("No matching closing bracket found in metadata response")
49
+
50
+ result = json.loads(response_text[start : end + 1])
51
+ if not isinstance(result, list):
52
+ raise ValueError(f"Expected list, got {type(result)}")
53
+ return result
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Single-batch API call
58
+ # ---------------------------------------------------------------------------
59
+
60
def _generate_batch(
    pdf_path: str,
    page_start_0: int,
    page_end_0: int,
    page_start_1: int,
    page_end_1: int,
) -> list[dict]:
    """Generate metadata for a contiguous range of pages.

    Args:
        pdf_path: Path to the full PDF on disk.
        page_start_0: First page (0-indexed, inclusive) for PDF extraction.
        page_end_0: Last page (0-indexed, inclusive) for PDF extraction.
        page_start_1: First page (1-indexed) — used in the prompt text.
        page_end_1: Last page (1-indexed) — used in the prompt text.

    Returns:
        List of metadata dicts for the pages in this batch.

    Raises:
        ValueError: when the model response contains no parseable JSON array.
    """
    client = genai.Client(api_key=GOOGLE_API_KEY)

    batch_pdf_bytes = extract_page_range_bytes(pdf_path, page_start_0, page_end_0)
    pdf_part = types.Part.from_bytes(data=batch_pdf_bytes, mime_type="application/pdf")

    num_batch_pages = page_end_1 - page_start_1 + 1
    instruction_text = (
        f"This PDF excerpt contains {num_batch_pages} page(s), "
        f"corresponding to pages {page_start_1} through {page_end_1} of the full drawing set.\n"
        f"Generate structured metadata for ALL {num_batch_pages} page(s). "
        f"Use page numbers {page_start_1} through {page_end_1} (1-indexed). "
        f"Return a JSON array with exactly {num_batch_pages} objects."
    )
    instruction_part = types.Part.from_text(text=instruction_text)

    response = client.models.generate_content(
        model=METADATA_MODEL,
        contents=[types.Content(role="user", parts=[pdf_part, instruction_part])],
        config=types.GenerateContentConfig(
            system_instruction=METADATA_SYSTEM_PROMPT,
        ),
    )

    # BUGFIX: response.text can be None (blocked/empty response) — calling
    # .strip() on it raised AttributeError. Coerce to "" so the failure
    # surfaces as the clearer "No JSON array found" ValueError instead.
    return _extract_json_array((response.text or "").strip())
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Public entry point
107
+ # ---------------------------------------------------------------------------
108
+
109
def generate_page_metadata(pdf_path: str, num_pages: int) -> list[dict]:
    """Extract per-page structured metadata from a PDF using parallel batches.

    The PDF is split into chunks of ``BATCH_SIZE`` pages. Each chunk is sent to
    Gemini concurrently via a thread pool. Results are merged, any missing
    pages are back-filled with placeholder entries, and the list is returned
    sorted by page number.

    Args:
        pdf_path: Path to the PDF on disk.
        num_pages: Total number of pages in the PDF.

    Returns:
        A list of dicts (1-indexed ``page_num``), one per page. An empty list
        when ``num_pages`` is zero or negative.

    Raises:
        RuntimeError: when every batch fails.
    """
    # BUGFIX: guard the empty case — ThreadPoolExecutor(max_workers=0)
    # raises ValueError.
    if num_pages <= 0:
        return []

    num_batches = math.ceil(num_pages / BATCH_SIZE)
    logger.info(
        "Starting parallel metadata generation: %d pages in %d batches of %d",
        num_pages, num_batches, BATCH_SIZE,
    )

    all_results: list[dict] = []
    errors: list[str] = []

    # Cap the pool: one thread per batch, but never an unbounded number of
    # concurrent Gemini calls for a very large drawing set.
    with ThreadPoolExecutor(max_workers=min(num_batches, 8)) as executor:
        futures = {}
        for batch_idx in range(num_batches):
            page_start_0 = batch_idx * BATCH_SIZE
            page_end_0 = min(page_start_0 + BATCH_SIZE - 1, num_pages - 1)
            page_start_1 = page_start_0 + 1
            page_end_1 = page_end_0 + 1

            future = executor.submit(
                _generate_batch,
                pdf_path,
                page_start_0,
                page_end_0,
                page_start_1,
                page_end_1,
            )
            futures[future] = (page_start_1, page_end_1)

        for future in as_completed(futures):
            batch_range = futures[future]
            try:
                batch_results = future.result()
                all_results.extend(batch_results)
                logger.info(
                    "Batch pages %d-%d complete: %d entries",
                    batch_range[0], batch_range[1], len(batch_results),
                )
            except Exception as e:
                errors.append(f"Batch pages {batch_range[0]}-{batch_range[1]} failed: {e}")
                logger.exception("Batch pages %d-%d failed", batch_range[0], batch_range[1])

    if errors and not all_results:
        # (Dropped a stray f-prefix on this literal — it had no placeholders.)
        raise RuntimeError(
            "All metadata batches failed:\n" + "\n".join(errors)
        )

    if errors:
        logger.warning("Some batches failed (results will have gaps): %s", errors)

    # Metadata stays 1-indexed (as the model produced it) because it will be
    # passed as context text to the planner model, which also uses 1-indexed.
    # The planner's *output* is converted to 0-indexed in nodes/planner.py.

    # Fill in any missing pages with minimal entries (1-indexed).
    covered_pages = {item.get("page_num") for item in all_results}
    for p in range(1, num_pages + 1):
        if p not in covered_pages:
            all_results.append({
                "page_num": p,
                "sheet_id": "unknown",
                "sheet_title": "Unknown",
                "discipline": "other",
                "page_type": "other",
                "description": "Metadata not extracted for this page.",
                "key_elements": [],
                "spatial_coverage": "",
            })

    # Sort by page number for stable downstream consumption.
    all_results.sort(key=lambda x: x.get("page_num", 0))

    return all_results
nodes/planner.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """plan_and_select node — plans crop tasks from PDF or cached page metadata."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from google import genai
9
+ from google.genai import types
10
+
11
+ from config import GOOGLE_API_KEY, PLANNER_MODEL
12
+ from prompts.planner import PLANNER_SYSTEM_PROMPT, PLANNER_SYSTEM_PROMPT_METADATA
13
+ from state import CropTask, DrawingReaderState
14
+
15
+
16
def plan_and_select(state: DrawingReaderState) -> dict:
    """Identify relevant pages and produce crop tasks.

    Two modes:
    - **Metadata mode** (fast): when ``page_metadata_json`` is available, the planner
      works from structured text descriptions — no PDF upload needed.
    - **PDF mode** (fallback): uploads the full PDF as a native PDF part to Gemini.

    Returns a partial state update with ``target_pages``, ``legend_pages`` and
    ``crop_tasks`` (page numbers converted to 0-indexed) plus a ``status_message``.
    """
    question = state["question"]
    pdf_path = state["pdf_path"]
    num_pages = state.get("num_pages", 0)
    investigation_round = state.get("investigation_round", 0)
    page_metadata_json = state.get("page_metadata_json", "")

    client = genai.Client(api_key=GOOGLE_API_KEY)

    if page_metadata_json:
        # ---- Metadata-based planning (fast, no PDF upload) ----
        question_text = (
            f"USER QUESTION: {question}\n\n"
            f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
            f"This is investigation round {investigation_round + 1}.\n\n"
            f"PAGE METADATA:\n{page_metadata_json}"
        )
        question_part = types.Part.from_text(text=question_text)

        response = client.models.generate_content(
            model=PLANNER_MODEL,
            contents=[types.Content(role="user", parts=[question_part])],
            config=types.GenerateContentConfig(
                system_instruction=PLANNER_SYSTEM_PROMPT_METADATA,
            ),
        )
        planning_mode = "metadata"
    else:
        # ---- Full PDF upload (fallback) ----
        pdf_bytes = Path(pdf_path).read_bytes()
        pdf_part = types.Part.from_bytes(data=pdf_bytes, mime_type="application/pdf")

        question_text = (
            f"USER QUESTION: {question}\n\n"
            f"The PDF has {num_pages} pages (1-indexed, from page 1 to page {num_pages}).\n"
            f"This is investigation round {investigation_round + 1}."
        )
        question_part = types.Part.from_text(text=question_text)

        response = client.models.generate_content(
            model=PLANNER_MODEL,
            contents=[types.Content(role="user", parts=[pdf_part, question_part])],
            config=types.GenerateContentConfig(
                system_instruction=PLANNER_SYSTEM_PROMPT,
            ),
        )
        planning_mode = "pdf"

    # BUGFIX: ``response.text`` can be None when the model returns no text part
    # (e.g. a safety block); treat that the same as an unparseable answer
    # instead of crashing on ``None.strip()``.
    response_text = (response.text or "").strip()

    # Parse the JSON response
    # Expected: {"target_pages": [...], "legend_pages": [...], "crop_tasks": [...]}
    json_match = re.search(r"\{.*\}", response_text, re.DOTALL)

    target_pages: list[int] = []
    legend_pages: list[int] = []
    crop_tasks: list[CropTask] = []

    if json_match:
        try:
            parsed = json.loads(json_match.group())

            # Model returns 1-indexed page numbers; convert to 0-indexed for internal use.
            valid_0indexed = set(range(num_pages))
            target_pages = [
                int(p) - 1 for p in parsed.get("target_pages", [])
                if int(p) - 1 in valid_0indexed
            ]
            legend_pages = [
                int(p) - 1 for p in parsed.get("legend_pages", [])
                if int(p) - 1 in valid_0indexed
            ]

            for t in parsed.get("crop_tasks", []):
                raw_page = int(t.get("page_num", 1))
                # BUGFIX: validate crop-task pages against the document range,
                # consistent with target_pages/legend_pages above. Previously a
                # hallucinated page number flowed through unchecked and produced
                # out-of-range crop tasks downstream.
                if raw_page - 1 not in valid_0indexed:
                    continue
                crop_tasks.append(
                    CropTask(
                        page_num=raw_page - 1,  # convert 1-indexed → 0-indexed
                        crop_instruction=t.get("crop_instruction", ""),
                        annotate=bool(t.get("annotate", False)),
                        annotation_prompt=t.get("annotation_prompt", ""),
                        label=t.get("label", f"Page {raw_page} crop"),
                        priority=int(t.get("priority", 1)),
                    )
                )
        except (json.JSONDecodeError, ValueError, KeyError):
            # Malformed planner output — fall through to the first-pages fallback.
            pass

    # Sort by priority (legends = 0 first)
    crop_tasks.sort(key=lambda t: t["priority"])

    # Fallback: if nothing identified, use first 5 pages
    if not target_pages and not crop_tasks:
        target_pages = list(range(min(num_pages, 5)))

    mode_label = "from page index" if planning_mode == "metadata" else "from full PDF"
    return {
        "target_pages": target_pages,
        "legend_pages": legend_pages,
        "crop_tasks": crop_tasks,
        "status_message": (
            f"Selected {len(target_pages)} pages ({len(legend_pages)} legends), "
            f"planned {len(crop_tasks)} crop tasks ({mode_label})."
        ),
    }
nodes/synthesizer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """synthesize_answer node — final answer combining Gemini + optional GPT perspectives."""
2
+ from __future__ import annotations
3
+
4
+ from google import genai
5
+ from google.genai import types
6
+
7
+ from config import GOOGLE_API_KEY, SYNTHESIZER_MODEL
8
+ from state import DrawingReaderState
9
+
10
+
11
def synthesize_answer(state: DrawingReaderState) -> dict:
    """Produce the final answer, synthesizing consensus if present."""
    question = state["question"]
    gemini_analysis = state.get("gemini_analysis", "")
    gpt_analysis = state.get("gpt_analysis", "")

    # Without a peer review (consensus disabled, or GPT produced nothing),
    # Gemini's analysis already is the final answer — pass it through.
    if not (state.get("enable_consensus", False) and gpt_analysis):
        return {
            "final_answer": gemini_analysis,
            "status_message": "Final answer ready.",
        }

    synthesis_prompt = f"""\
You are producing a FINAL ANSWER to a construction drawing question.

USER QUESTION: {question}

ANALYST A (Gemini) says:
{gemini_analysis}

ANALYST B (GPT) peer review:
{gpt_analysis}

YOUR TASK:
1. If both analysts AGREE: produce a confident, unified answer citing the consensus.
2. If they PARTIALLY AGREE: produce the answer based on the agreed points, and \
explicitly note areas of disagreement with evidence from both sides.
3. If they DISAGREE: present both interpretations clearly, explain the discrepancy, \
and state which interpretation appears better supported by the evidence (or that \
the question cannot be definitively answered from the available images).

Always cite page numbers, sheet names, and image labels for every factual claim.
"""

    # Merge the two perspectives into one answer with the synthesizer model.
    client = genai.Client(api_key=GOOGLE_API_KEY)
    response = client.models.generate_content(
        model=SYNTHESIZER_MODEL,
        contents=[synthesis_prompt],
    )

    return {
        "final_answer": response.text,
        "status_message": "Final synthesized answer ready.",
    }
prompts/__init__.py ADDED
File without changes
prompts/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (220 Bytes). View file
 
prompts/__pycache__/analyzer.cpython-313.pyc ADDED
Binary file (3.06 kB). View file
 
prompts/__pycache__/annotator.cpython-313.pyc ADDED
Binary file (984 Bytes). View file
 
prompts/__pycache__/consensus.cpython-313.pyc ADDED
Binary file (1.46 kB). View file
 
prompts/__pycache__/cropper.cpython-313.pyc ADDED
Binary file (1.07 kB). View file
 
prompts/__pycache__/metadata.cpython-313.pyc ADDED
Binary file (2.48 kB). View file
 
prompts/__pycache__/planner.cpython-313.pyc ADDED
Binary file (8.79 kB). View file
 
prompts/analyzer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt for the analyze_findings node."""
2
+
3
+ ANALYZER_SYSTEM_PROMPT = """\
4
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
5
+ and construction documentation.
6
+
7
+ You are the ANALYST in a multi-step drawing analysis workflow. You receive:
8
+ - The user's question
9
+ - Cropped images from relevant drawing pages (some may be annotated)
10
+ - Retrieved text context from the document
11
+ - Image labels describing what each crop shows
12
+
13
+ The FIRST images are always LEGENDS, SCHEDULES, or GENERAL NOTES. Study these \
14
+ carefully to understand all symbols, abbreviations, and conventions BEFORE examining \
15
+ the detail crops that follow.
16
+
17
+ YOUR RESPONSIBILITIES:
18
+ 1. **Study legends first.** Identify all relevant symbols, callouts, and abbreviations \
19
+ from the legend crops before analyzing any detail crops.
20
+ 2. **Examine each crop carefully.** For annotated crops, reference the numbered \
21
+ annotations (e.g., "Item #3 shows a supply air diffuser").
22
+ 3. **Provide spatially-grounded answers.** Always describe WHERE things are: \
23
+ "Located in the upper-left quadrant, north of AHU-1, adjacent to Room 204."
24
+ 4. **Describe symbols visually.** When referencing equipment or symbols, describe \
25
+ what they look like: "the circular symbol with radiating lines represents..."
26
+ 5. **Cite your sources.** Reference images by their labels: "As shown in the crop \
27
+ labeled 'Page 12 (M-101) - Gymnasium Diffusers'..."
28
+ 6. **Be honest about uncertainty.** If you cannot clearly see something, or if \
29
+ information is ambiguous, say so explicitly. Never guess.
30
+ 7. **Trace paths step-by-step.** When describing duct routes, piping, or conduit: \
31
+ describe the path in spatial order from source to destination.
32
+
33
+ ANSWER FORMAT:
34
+ - Restate the question briefly
35
+ - Walk through your reasoning with references to specific crops and annotations
36
+ - Provide a clear, definitive answer (or explain what is uncertain and why)
37
+ - Mention page numbers and sheet names for every factual claim
38
+
39
+ ADDITIONAL INVESTIGATION:
40
+ If you determine that the provided crops are INSUFFICIENT to answer the question, \
41
+ you may request additional crops. To do this, include a JSON block at the END of \
42
+ your response in this exact format. ALL PAGE NUMBERS ARE 1-INDEXED (first page = 1).
43
+
44
+ ```json
45
+ {"needs_more": true, "reason": "brief explanation of what information is missing", "additional_crops": [
46
+ {"page_num": 15, "crop_instruction": "...", "annotate": false, "annotation_prompt": "", "label": "...", "priority": 1}
47
+ ]}
48
+ ```
49
+
50
+ RULES FOR ADDITIONAL CROPS:
51
+ - Only request crops for areas you have NOT already examined. Never re-request the \
52
+ same page region — you already have those images above.
53
+ - Each additional crop must target a DIFFERENT area or page than what was already provided.
54
+ - Explain briefly WHY you need each additional crop (what information is missing).
55
+ - Do not request them speculatively — only when truly necessary to answer the question.
56
+ """
prompts/annotator.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt wrapper for nano-banana annotation.
2
+
3
+ The actual annotation prompt is written per-task by the planner node.
4
+ This module provides a wrapper that ensures consistent instructions
5
+ around the planner's specific annotation request.
6
+ """
7
+
8
+ ANNOTATION_WRAPPER = """\
9
+ You are annotating a cropped section of a construction/engineering drawing.
10
+
11
+ {annotation_prompt}
12
+
13
+ CRITICAL RULES:
14
+ - Keep the original drawing CLEARLY VISIBLE underneath your annotations.
15
+ - Use bright, high-contrast colors that stand out against the drawing.
16
+ - Make labels and numbers large enough to read easily.
17
+ - Number items sequentially (1, 2, 3...) when counting.
18
+ - Use consistent colors: RED for primary items of interest, BLUE for secondary \
19
+ items, GREEN for paths/traces.
20
+ - Do not remove, obscure, or redraw any part of the original drawing.
21
+ - Output the annotated image.
22
+ """
prompts/consensus.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt for the GPT consensus review node."""
2
+
3
+ CONSENSUS_SYSTEM_PROMPT = """\
4
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
5
+ and construction documentation.
6
+
7
+ You are performing a PEER REVIEW of another analyst's interpretation of construction \
8
+ drawings. You will receive:
9
+ - The user's original question
10
+ - The same cropped drawing images that the analyst examined
11
+ - The analyst's draft answer
12
+
13
+ YOUR TASK:
14
+ 1. Independently examine each cropped image.
15
+ 2. Compare your observations against the analyst's claims.
16
+ 3. For each factual claim in the analyst's response, determine if you:
17
+ - AGREE (you see the same evidence in the images)
18
+ - DISAGREE (you see different evidence, or the claim is not supported)
19
+ - CANNOT VERIFY (the available images don't clearly show what's claimed)
20
+ 4. If you disagree, explain specifically what you see differently and cite which \
21
+ image/crop contradicts the finding.
22
+ 5. Note any details the analyst may have MISSED that are visible in the images.
23
+
24
+ OUTPUT FORMAT:
25
+ - Start with your overall assessment: AGREE / PARTIALLY AGREE / DISAGREE
26
+ - List each point of agreement or disagreement with image references
27
+ - Provide your own answer to the user's question if it differs from the analyst's
28
+ - Be specific and cite crop labels for every observation
29
+ """
prompts/cropper.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt template for the execute_crops node (Gemini code_execution)."""
2
+
3
+ CROPPER_PROMPT_TEMPLATE = """\
4
+ You are processing a construction drawing page. Your task:
5
+ {crop_instruction}
6
+
7
+ Instructions:
8
+ 1. Examine the full image to orient yourself and locate the requested area.
9
+ 2. Use Python with PIL/Pillow to crop the image to just the requested region.
10
+ 3. Add padding of approximately 40 pixels on each side (clamped to image bounds).
11
+ 4. Iterate if needed - if your first crop is too wide, too narrow, or misses the \
12
+ target, refine it. Take up to 3 attempts to get a tight, accurate crop.
13
+ 5. Output the final cropped image.
14
+
15
+ IMPORTANT RULES:
16
+ - Do NOT annotate, draw on, or modify the image content in any way.
17
+ - Just produce a clean, accurate crop of the requested area.
18
+ - The final output must be the best possible crop.
19
+ - If you cannot locate the requested area, crop to the most likely region and \
20
+ note this in your text output.
21
+ """
prompts/metadata.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompt for the background page metadata generator."""
2
+
3
+ METADATA_SYSTEM_PROMPT = """\
4
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
5
+ and construction documentation.
6
+
7
+ You are analyzing a BATCH of pages from a construction drawing PDF to generate \
8
+ structured metadata for each page. This metadata will be used by a downstream \
9
+ planner to select relevant pages WITHOUT needing to re-examine the full PDF visually.
10
+
11
+ YOUR TASK: For EVERY page in this batch, produce a JSON object with these fields. \
12
+ Use the page numbers specified in the user instruction (1-indexed).
13
+
14
+ - "page_num": integer, 1-indexed page number as specified in the instruction
15
+ - "sheet_id": string, the sheet number/ID from the title block (e.g., "M-101", \
16
+ "A-201", "G-001"). Use "unknown" if not visible.
17
+ - "sheet_title": string, the sheet title from the title block (e.g., "First Floor \
18
+ HVAC Plan"). Use "Untitled" if not visible.
19
+ - "discipline": one of "mechanical", "electrical", "plumbing", "architectural", \
20
+ "structural", "civil", "general", "fire_protection", "demolition", "other"
21
+ - "page_type": one of "floor_plan", "legend", "schedule", "detail", "section", \
22
+ "elevation", "title_sheet", "notes", "diagram", "cover", "other"
23
+ - "description": 2-4 sentences describing what is visible on this page. Be specific \
24
+ about spatial areas covered, equipment shown, systems depicted.
25
+ - "key_elements": list of strings naming notable items visible (equipment tags, room \
26
+ names, system names, detail callouts). Include 5-15 items per page.
27
+ - "spatial_coverage": string describing what physical area or zone this page covers \
28
+ (e.g., "First floor, east wing", "Building section A-A looking north", "Roof plan"). \
29
+ Empty string for legends/schedules.
30
+
31
+ YOU MUST RETURN A SINGLE JSON ARRAY containing one object per page. No other text \
32
+ before or after. The array must be ordered by page_num.
33
+
34
+ IMPORTANT RULES:
35
+ 1. Cover EVERY page in this batch. Do not skip any.
36
+ 2. Be specific in descriptions — mention room numbers, equipment tags, duct sizes, \
37
+ panel names.
38
+ 3. For legend/schedule pages, list the specific items they define in key_elements.
39
+ 4. Use discipline-specific vocabulary (e.g., "VAV box", "branch circuit", "sanitary riser").
40
+ """
prompts/planner.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """System prompts for the plan_and_select node.
2
+
3
+ Two variants:
4
+ - PLANNER_SYSTEM_PROMPT: used when the full PDF is uploaded (first question / no metadata)
5
+ - PLANNER_SYSTEM_PROMPT_METADATA: used when pre-computed page metadata is available
6
+ """
7
+
8
+ PLANNER_SYSTEM_PROMPT = """\
9
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
10
+ and construction documentation. You specialize in interpreting construction drawing \
11
+ sets (architectural, mechanical, electrical, plumbing, structural, civil, demolition).
12
+
13
+ You are the PLANNER in a multi-step drawing analysis workflow. You receive the \
14
+ COMPLETE PDF of a construction drawing set along with the user's question.
15
+
16
+ YOUR JOB: Analyze the entire PDF to understand what is on each page, then produce \
17
+ a plan that identifies relevant pages and specifies crop tasks for downstream agents.
18
+
19
+ YOU MUST RETURN A SINGLE JSON OBJECT with three keys. No other text before or after.
20
+
21
+ {
22
+ "target_pages": [1-indexed page numbers relevant to the question],
23
+ "legend_pages": [1-indexed page numbers that are legends/schedules/notes],
24
+ "crop_tasks": [list of crop task objects]
25
+ }
26
+
27
+ WORKFLOW RULES:
28
+
29
+ 1. **Review the entire PDF first.** Understand the drawing set structure: title \
30
+ sheets, legends, floor plans, details, schedules, sections, elevations.
31
+
32
+ 2. **Select target pages.** Identify the pages most relevant to the user's question \
33
+ (up to 10). Include both the detail pages AND any legends/schedules needed to \
34
+ interpret those pages.
35
+
36
+ 3. **Identify legend pages.** From your target pages, flag which ones are legends, \
37
+ schedules, abbreviation lists, symbol keys, general notes, keynotes, or \
38
+ specification tables. Only include legends DIRECTLY relevant to the question -- \
39
+ e.g., if the question is about electrical, include the electrical legend only, \
40
+ NOT plumbing or structural legends.
41
+
42
+ 4. **Plan crop tasks.** For each relevant area on each target page, create a crop \
43
+ task object with these fields:
44
+ - "page_num": 1-indexed page number (first page = 1)
45
+ - "crop_instruction": precise description of what region to crop, e.g., \
46
+ "Crop to Room 204 and its immediate surroundings showing all ductwork connections"
47
+ - "annotate": true/false -- set true when the question requires counting items, \
48
+ tracing paths, identifying spatial relationships, or distinguishing similar items. \
49
+ Set false for legend/schedule/text crops.
50
+ - "annotation_prompt": when annotate=true, a clear prompt describing what to \
51
+ highlight (colors, numbering, what to annotate). Always include "Keep the \
52
+ original drawing clearly visible underneath." Leave empty string when annotate=false.
53
+ - "label": descriptive label for the crop, e.g., "Page 12 (M-101) - Gymnasium HVAC layout"
54
+ - "priority": 0 for legends/schedules, 1 for detail crops
55
+
56
+ 5. **Legends first.** Always include crop tasks for relevant legends. Assign priority=0.
57
+
58
+ 6. **Be specific.** Each crop instruction must describe a precise region of the page. \
59
+ NEVER write "Crop to the relevant area."
60
+
61
+ 7. **Minimize work.** Choose the FEWEST crops needed for completeness. Each crop is \
62
+ expensive (5-30 seconds). Aim for 3-6 total crop tasks. If two items are on the \
63
+ same area of a page, use ONE crop covering both. One well-targeted crop per page \
64
+ is usually sufficient.
65
+
66
+ 8. **Labels matter.** Each crop needs a descriptive label that the analysis model \
67
+ will use to reference the image.
68
+
69
+ ALL PAGE NUMBERS ARE 1-INDEXED. The first page of the PDF is page 1, not page 0.
70
+
71
+ EXAMPLE OUTPUT:
72
+ {
73
+ "target_pages": [5, 12, 14],
74
+ "legend_pages": [5],
75
+ "crop_tasks": [
76
+ {
77
+ "page_num": 5,
78
+ "crop_instruction": "Crop to the HVAC legend showing all duct symbols and abbreviations.",
79
+ "annotate": false,
80
+ "annotation_prompt": "",
81
+ "label": "Page 5 (M-001 Legend) - HVAC Symbol Legend",
82
+ "priority": 0
83
+ },
84
+ {
85
+ "page_num": 12,
86
+ "crop_instruction": "Crop to the gymnasium area showing all supply air diffusers and ductwork.",
87
+ "annotate": true,
88
+ "annotation_prompt": "Draw bright red numbered bounding boxes (1, 2, 3...) around each supply air diffuser symbol. Draw blue boxes around any AHU or RTU. Keep the original drawing clearly visible underneath.",
89
+ "label": "Page 12 (M-101) - Gymnasium Diffusers",
90
+ "priority": 1
91
+ }
92
+ ]
93
+ }
94
+ """
95
+
96
+
97
+ PLANNER_SYSTEM_PROMPT_METADATA = """\
98
+ You are a senior expert in architecture, MEP engineering, structural engineering, \
99
+ and construction documentation. You specialize in interpreting construction drawing \
100
+ sets (architectural, mechanical, electrical, plumbing, structural, civil, demolition).
101
+
102
+ You are the PLANNER in a multi-step drawing analysis workflow. You DO NOT have the \
103
+ visual PDF. Instead, you receive STRUCTURED METADATA describing each page of the \
104
+ drawing set. Use this metadata to select relevant pages and plan crop tasks.
105
+
106
+ The metadata for each page includes:
107
+ - sheet_id: the sheet number (e.g., "M-101")
108
+ - sheet_title: the sheet name (e.g., "First Floor HVAC Plan")
109
+ - discipline: mechanical/electrical/plumbing/architectural/etc.
110
+ - page_type: floor_plan/legend/schedule/detail/section/elevation/etc.
111
+ - description: 2-4 sentences describing what is visible
112
+ - key_elements: list of notable items (equipment tags, room names, etc.)
113
+ - spatial_coverage: what physical area the page covers
114
+
115
+ YOU MUST RETURN A SINGLE JSON OBJECT with three keys. No other text before or after.
116
+
117
+ {
118
+ "target_pages": [1-indexed page numbers relevant to the question],
119
+ "legend_pages": [1-indexed page numbers that are legends/schedules/notes],
120
+ "crop_tasks": [list of crop task objects]
121
+ }
122
+
123
+ WORKFLOW RULES:
124
+
125
+ 1. **Scan all page metadata.** Match the user's question against page descriptions, \
126
+ key_elements, disciplines, and spatial_coverage to find relevant pages.
127
+
128
+ 2. **Select target pages.** Choose the pages most relevant to the user's question \
129
+ (up to 10). Use the discipline, page_type, key_elements, and description fields \
130
+ to make informed selections.
131
+
132
+ 3. **Identify legend pages.** Use the page_type and discipline fields to find \
133
+ relevant legends. Only include legends for the discipline(s) relevant to \
134
+ the question.
135
+
136
+ 4. **Plan crop tasks.** Based on each page's description and key_elements, create \
137
+ crop tasks targeting specific regions mentioned in the metadata. Each crop task:
138
+ - "page_num": 1-indexed page number (first page = 1)
139
+ - "crop_instruction": precise description of what region to crop. Use information \
140
+ from the page's description and key_elements to write specific instructions.
141
+ - "annotate": true when the question requires counting, tracing, or spatial analysis. \
142
+ false for legends/schedules.
143
+ - "annotation_prompt": when annotate=true, describe what to highlight. Include \
144
+ "Keep the original drawing clearly visible underneath." Empty string when annotate=false.
145
+ - "label": descriptive label using sheet_id and sheet_title from metadata.
146
+ - "priority": 0 for legends/schedules, 1 for detail crops.
147
+
148
+ 5. **Legends first.** Always include crop tasks for relevant legends with priority=0.
149
+
150
+ 6. **Be specific.** Use key_elements and description text from metadata to write \
151
+ precise crop instructions.
152
+
153
+ 7. **Minimize work.** Choose the FEWEST crops needed for completeness. Each crop is \
154
+ expensive (5-30 seconds). Aim for 3-6 total crop tasks. Merge overlapping regions \
155
+ into a single broader crop rather than creating separate crops for adjacent areas \
156
+ on the same page. One well-targeted crop per page is usually sufficient.
157
+
158
+ 8. **Labels matter.** Each crop needs a descriptive label that the analysis model \
159
+ will use to reference the image.
160
+
161
+ ALL PAGE NUMBERS ARE 1-INDEXED. The first page of the PDF is page 1, not page 0.
162
+
163
+ EXAMPLE OUTPUT:
164
+ {
165
+ "target_pages": [5, 12, 14],
166
+ "legend_pages": [5],
167
+ "crop_tasks": [
168
+ {
169
+ "page_num": 5,
170
+ "crop_instruction": "Crop to the HVAC legend showing all duct symbols and abbreviations.",
171
+ "annotate": false,
172
+ "annotation_prompt": "",
173
+ "label": "Page 5 (M-001 Legend) - HVAC Symbol Legend",
174
+ "priority": 0
175
+ },
176
+ {
177
+ "page_num": 12,
178
+ "crop_instruction": "Crop to the gymnasium area showing all supply air diffusers and ductwork.",
179
+ "annotate": true,
180
+ "annotation_prompt": "Draw bright red numbered bounding boxes (1, 2, 3...) around each supply air diffuser symbol. Draw blue boxes around any AHU or RTU. Keep the original drawing clearly visible underneath.",
181
+ "label": "Page 12 (M-101) - Gymnasium Diffusers",
182
+ "priority": 1
183
+ }
184
+ ]
185
+ }
186
+ """
tools/__init__.py ADDED
File without changes
tools/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes). View file
 
tools/__pycache__/crop_cache.cpython-313.pyc ADDED
Binary file (6.87 kB). View file
 
tools/__pycache__/file_search.cpython-313.pyc ADDED
Binary file (2.72 kB). View file
 
tools/__pycache__/image_store.cpython-313.pyc ADDED
Binary file (6.64 kB). View file
 
tools/__pycache__/metadata_cache.cpython-313.pyc ADDED
Binary file (6.93 kB). View file
 
tools/__pycache__/pdf_processor.cpython-313.pyc ADDED
Binary file (4.47 kB). View file
 
tools/__pycache__/vector_store.cpython-313.pyc ADDED
Binary file (3.26 kB). View file
 
tools/crop_cache.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """In-session crop cache — avoids redundant Gemini API calls for identical crops.
2
+
3
+ Stored in ``st.session_state`` so it persists across questions within a single
4
+ Streamlit session, but is discarded when the session ends.
5
+
6
+ Matching strategy:
7
+ - **Exact match** on ``(page_num, crop_instruction)`` is the primary lookup.
8
+ - **Fuzzy match** with a simple normalized overlap score handles cases where
9
+ the planner rephrases slightly (e.g., "Crop the gymnasium area" vs
10
+ "Crop gymnasium area showing diffusers"). Only matches above a high
11
+ threshold (0.85) are considered hits to avoid false positives.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import re
17
+ from dataclasses import dataclass, field
18
+
19
+ from state import ImageRef
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class CachedCrop:
    """A cached crop entry with its original instruction and result."""
    # Page the crop was taken from; fuzzy matching is scoped to this page
    page_num: int
    # The planner's crop instruction as originally issued (exact-match key)
    crop_instruction: str
    # Human-readable label describing the crop
    label: str
    # Lightweight reference to the resulting crop image
    image_ref: ImageRef
    # Normalised token set for fuzzy matching (computed once at insert time)
    _tokens: frozenset[str] = field(default_factory=frozenset, repr=False)
33
+
34
+
35
+ def _normalise_tokens(text: str) -> frozenset[str]:
36
+ """Lowercase, strip punctuation, split into a token set."""
37
+ cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
38
+ return frozenset(cleaned.split())
39
+
40
+
41
+ def _token_overlap(a: frozenset[str], b: frozenset[str]) -> float:
42
+ """Jaccard-style overlap: |intersection| / |union|."""
43
+ if not a or not b:
44
+ return 0.0
45
+ return len(a & b) / len(a | b)
46
+
47
+
48
class CropCache:
    """Session-scoped cache mapping (page, instruction) → ImageRef.

    Thread-safe for concurrent reads (dict lookups under CPython's GIL) but
    writes are serialised via the single-threaded Streamlit main thread.
    """

    # Minimum token-overlap score to accept a fuzzy match.
    # Tuned so that minor rephrasing (dropping "the", "all") still matches
    # (~0.78 overlap) while genuinely different instructions miss (~0.06-0.42).
    FUZZY_THRESHOLD: float = 0.70

    def __init__(self) -> None:
        # Primary index: exact (page_num, instruction) → CachedCrop
        self._exact: dict[tuple[int, str], CachedCrop] = {}
        # Secondary list for fuzzy scanning (same objects as _exact values)
        self._entries: list[CachedCrop] = []
        # Hit/miss counters feed the human-readable ``stats`` property.
        self._hit_count: int = 0
        self._miss_count: int = 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def lookup(self, page_num: int, crop_instruction: str) -> ImageRef | None:
        """Return a cached ImageRef if a matching crop exists, else None.

        Tries exact match first, then falls back to fuzzy token overlap
        restricted to the same page.
        """
        key = (page_num, crop_instruction)

        # 1. Exact match
        if key in self._exact:
            self._hit_count += 1
            entry = self._exact[key]
            logger.info(
                "CropCache HIT (exact) page=%d instruction='%s' → %s",
                page_num, crop_instruction[:60], entry.image_ref["id"],
            )
            return entry.image_ref

        # 2. Fuzzy match — only among entries on the same page
        query_tokens = _normalise_tokens(crop_instruction)
        best_score = 0.0
        best_entry: CachedCrop | None = None

        # Linear scan over all entries; acceptable because the cache only
        # holds crops produced within a single session.
        for entry in self._entries:
            if entry.page_num != page_num:
                continue
            score = _token_overlap(query_tokens, entry._tokens)
            if score > best_score:
                best_score = score
                best_entry = entry

        if best_entry is not None and best_score >= self.FUZZY_THRESHOLD:
            self._hit_count += 1
            logger.info(
                "CropCache HIT (fuzzy %.2f) page=%d instruction='%s' → %s",
                best_score, page_num, crop_instruction[:60],
                best_entry.image_ref["id"],
            )
            return best_entry.image_ref

        self._miss_count += 1
        return None

    def register(
        self,
        page_num: int,
        crop_instruction: str,
        label: str,
        image_ref: ImageRef,
        *,
        is_fallback: bool = False,
    ) -> None:
        """Register a successful crop in the cache.

        Duplicate keys are ignored (first registration wins).

        Parameters
        ----------
        is_fallback
            If True, the crop is a full-page fallback (Gemini failed to crop).
            These are NOT cached because they don't represent a useful targeted crop.
        """
        if is_fallback:
            logger.debug(
                "CropCache SKIP (fallback) page=%d instruction='%s'",
                page_num, crop_instruction[:60],
            )
            return

        key = (page_num, crop_instruction)
        if key in self._exact:
            return  # already cached

        entry = CachedCrop(
            page_num=page_num,
            crop_instruction=crop_instruction,
            label=label,
            image_ref=image_ref,
            _tokens=_normalise_tokens(crop_instruction),
        )
        self._exact[key] = entry
        self._entries.append(entry)
        logger.info(
            "CropCache REGISTER page=%d instruction='%s' → %s",
            page_num, crop_instruction[:60], image_ref["id"],
        )

    @property
    def size(self) -> int:
        """Number of cached crop entries."""
        return len(self._entries)

    @property
    def stats(self) -> str:
        """Human-readable one-line summary of cache size and hit rate."""
        total = self._hit_count + self._miss_count
        rate = (self._hit_count / total * 100) if total > 0 else 0
        return (
            f"CropCache: {self.size} entries, "
            f"{self._hit_count} hits / {self._miss_count} misses "
            f"({rate:.0f}% hit rate)"
        )

    def clear(self) -> None:
        """Reset the cache (e.g., when a new PDF is loaded)."""
        self._exact.clear()
        self._entries.clear()
        self._hit_count = 0
        self._miss_count = 0
tools/image_store.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import io
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from pathlib import Path
9
+
10
+ from PIL import Image
11
+
12
+ from state import ImageRef
13
+
14
+
15
class ImageStore:
    """Disk-backed image manager.

    LangGraph state carries only lightweight ``ImageRef`` dicts; every heavy
    image payload is persisted beneath ``base_dir`` (``pages/``, ``crops/``,
    ``annotated/``) and read back from disk on demand.
    """

    def __init__(self, base_dir: str):
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)
        self._pages_dir = self.base_dir / "pages"
        self._crops_dir = self.base_dir / "crops"
        self._annotated_dir = self.base_dir / "annotated"
        for subdir in (self._pages_dir, self._crops_dir, self._annotated_dir):
            subdir.mkdir(exist_ok=True)

    # ------------------------------------------------------------------
    # Save helpers
    # ------------------------------------------------------------------

    def save_page_image(self, page_num: int, image_bytes: bytes) -> ImageRef:
        """Decode raw page bytes, store them as a PNG, and return a reference."""
        decoded = Image.open(io.BytesIO(image_bytes))
        destination = self._pages_dir / f"page_{page_num}.png"
        decoded.save(str(destination), format="PNG")
        return ImageRef(
            id=f"page_{page_num}",
            path=str(destination),
            label=f"Page {page_num} (full page)",
            page_num=page_num,
            crop_type="full_page",
            width=decoded.width,
            height=decoded.height,
        )

    def save_crop(
        self,
        page_num: int,
        crop_id: str,
        image: Image.Image,
        label: str,
    ) -> ImageRef:
        """Store a cropped region image and return a reference to it."""
        ref_id = f"page_{page_num}_{crop_id}"
        destination = self._crops_dir / f"{ref_id}.png"
        image.save(str(destination), format="PNG")
        return ImageRef(
            id=ref_id,
            path=str(destination),
            label=label,
            page_num=page_num,
            crop_type="crop",
            width=image.width,
            height=image.height,
        )

    def save_annotated(
        self,
        source_ref: ImageRef,
        annotated_image: Image.Image,
    ) -> ImageRef:
        """Store an annotated variant of *source_ref* and return its reference."""
        ann_id = f"{source_ref['id']}_ann"
        destination = self._annotated_dir / f"{ann_id}.png"
        annotated_image.save(str(destination), format="PNG")
        return ImageRef(
            id=ann_id,
            path=str(destination),
            label=f"{source_ref['label']} [annotated]",
            page_num=source_ref["page_num"],
            crop_type="annotated",
            width=annotated_image.width,
            height=annotated_image.height,
        )

    # ------------------------------------------------------------------
    # Load helpers
    # ------------------------------------------------------------------

    def load_image(self, ref: ImageRef) -> Image.Image:
        """Open the referenced image lazily with PIL."""
        return Image.open(ref["path"])

    def load_bytes(self, ref: ImageRef) -> bytes:
        """Read the referenced image's raw bytes from disk."""
        return Path(ref["path"]).read_bytes()

    def get_page_image_path(self, page_num: int) -> str:
        """Return the path where the full render of *page_num* lives."""
        return str(self._pages_dir / f"page_{page_num}.png")

    def load_page_bytes(self, page_num: int) -> bytes:
        """Read the raw bytes of a full page render from disk."""
        return Path(self.get_page_image_path(page_num)).read_bytes()

    # ------------------------------------------------------------------
    # Format conversions for different model APIs
    # ------------------------------------------------------------------

    def to_gemini_part(self, ref: ImageRef):
        """Return a ``google.genai.types.Part`` for Gemini multimodal prompts."""
        from google.genai import types

        return types.Part.from_bytes(
            data=self.load_bytes(ref),
            mime_type="image/png",
        )

    def to_openai_base64(self, ref: ImageRef) -> dict:
        """Return an OpenAI-compatible image content block (base64 data URI)."""
        encoded = base64.b64encode(self.load_bytes(ref)).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encoded}"},
        }

    def create_thumbnail(self, ref: ImageRef, max_size: int = 400) -> bytes:
        """Downscale the referenced image (aspect-preserving) and return PNG bytes."""
        thumb = self.load_image(ref)
        # PIL's thumbnail() resizes in place, never upscales.
        thumb.thumbnail((max_size, max_size))
        out = io.BytesIO()
        thumb.save(out, format="PNG")
        return out.getvalue()

    # ------------------------------------------------------------------
    # Cleanup
    # ------------------------------------------------------------------

    def cleanup(self):
        """Best-effort removal of the entire on-disk image tree."""
        if self.base_dir.exists():
            shutil.rmtree(self.base_dir, ignore_errors=True)
tools/metadata_cache.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Disk-based metadata cache + thread-safe in-memory container for background generation."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import json
6
+ import logging
7
+ import threading
8
+ from pathlib import Path
9
+
10
logger = logging.getLogger(__name__)

# Cache directory local to the project
CACHE_DIR = Path(__file__).resolve().parent.parent / ".cache" / "metadata"

# Cache version — bump this when the metadata format changes to invalidate old caches.
# The version is embedded in each cache filename ("<sha256>_<version>.json"),
# so bumping it simply stops matching older files rather than deleting them.
# v2: switched page_num from 0-indexed to 1-indexed
# v3: removed related_legends, has_title_block, title_block_text; parallel batch generation
_CACHE_VERSION = "v3"
19
+
20
+
21
+ def _pdf_hash(pdf_bytes: bytes) -> str:
22
+ """Compute a SHA-256 hash of the PDF bytes for cache keying."""
23
+ return hashlib.sha256(pdf_bytes).hexdigest()
24
+
25
+
26
def get_cached_metadata(pdf_bytes: bytes) -> list[dict] | None:
    """Check if metadata exists on disk for the given PDF.

    Returns the metadata list if found, None otherwise.
    """
    candidate = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    if not candidate.exists():
        return None
    try:
        return json.loads(candidate.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable cache file — treat as a miss.
        return None
38
+
39
+
40
def save_metadata(pdf_bytes: bytes, metadata_list: list[dict]) -> None:
    """Save metadata to disk, keyed by PDF hash (and cache version)."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    destination = CACHE_DIR / f"{_pdf_hash(pdf_bytes)}_{_CACHE_VERSION}.json"
    serialized = json.dumps(metadata_list, indent=2)
    destination.write_text(serialized, encoding="utf-8")
48
+
49
+
50
class MetadataState:
    """Thread-safe container for background metadata generation state.

    Stored as a single object in ``st.session_state``. The background thread
    mutates fields on *this same object* (safe under CPython's GIL for simple
    attribute assignments). The main Streamlit thread reads from it on each
    rerun.
    """

    def __init__(self) -> None:
        self.status: str = "not_started"  # not_started | in_progress | ready | failed
        self.data_json: str = ""  # pre-serialized JSON for the planner
        self.error: str | None = None
        self._lock = threading.Lock()

    # -- convenience helpers --------------------------------------------------

    def set_ready(self, data_json: str) -> None:
        """Store the finished metadata JSON and mark the state as ready."""
        with self._lock:
            self.data_json = data_json
            self.status = "ready"

    def set_failed(self, error: str) -> None:
        """Record the failure message and mark the state as failed."""
        with self._lock:
            self.error = error
            self.status = "failed"

    def set_in_progress(self) -> None:
        """Mark generation as started."""
        with self._lock:
            self.status = "in_progress"

    @property
    def is_ready(self) -> bool:
        # Lock-free read of a single attribute is safe under the GIL.
        return self.status == "ready"

    def _generate(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
        *,
        context: str = "Metadata",
    ) -> None:
        """Shared generation body for the sync and background entry points.

        Generates per-page metadata, persists it to the disk cache, and
        transitions this object to ``ready`` (or ``failed`` on any error).
        ``context`` only changes the log wording ("Metadata" vs
        "Background metadata"); behavior is otherwise identical.
        """
        try:
            # Deferred import — only needed when generation actually runs.
            from nodes.metadata_generator import generate_page_metadata

            metadata_list = generate_page_metadata(pdf_path, num_pages)
            save_metadata(pdf_bytes, metadata_list)
            self.set_ready(json.dumps(metadata_list, indent=2))
            logger.info("%s generation complete.", context)
        except Exception as e:
            self.set_failed(str(e))
            logger.exception("%s generation failed", context)

    def generate_sync(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Generate metadata synchronously (blocking).

        Same logic as ``start_background_generation`` but runs in the calling
        thread. Used during initialization so metadata is ready before the
        user can ask questions.
        """
        self.set_in_progress()
        self._generate(pdf_path, num_pages, pdf_bytes, context="Metadata")

    def start_background_generation(
        self,
        pdf_path: str,
        num_pages: int,
        pdf_bytes: bytes,
    ) -> None:
        """Launch a daemon thread that generates metadata and writes to disk cache."""
        self.set_in_progress()
        thread = threading.Thread(
            target=self._generate,
            args=(pdf_path, num_pages, pdf_bytes),
            kwargs={"context": "Background metadata"},
            daemon=True,
        )
        thread.start()
tools/pdf_processor.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import fitz # PyMuPDF
7
+
8
+ from config import PDF_RENDER_DPI
9
+
10
+
11
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Uses the document as a context manager so the handle is released even
    if opening or reading the page count raises.
    """
    with fitz.open(pdf_path) as doc:
        return len(doc)
17
+
18
+
19
def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int:
    """Render every PDF page as a PNG image.

    This is the primary rendering method, called once during PDF ingestion
    to pre-render all pages at the configured DPI.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory receiving ``page_<n>.png`` files (created if missing).
        dpi: Render resolution; PyMuPDF's base resolution is 72 dpi, so the
            zoom factor is ``dpi / 72``.

    Returns:
        The number of pages rendered.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # Context manager guarantees the document is closed even if a page
    # fails to render partway through the loop (the original leaked the
    # handle on any mid-loop exception).
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)
        for page_num in range(num_pages):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=matrix)
            (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))

    return num_pages
43
+
44
+
45
def render_single_page(
    pdf_path: str,
    page_num: int,
    output_dir: str,
    dpi: int = PDF_RENDER_DPI,
) -> None:
    """Render a single PDF page as a PNG and save to disk.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Page to render (0-indexed, as consumed by ``load_page``).
        output_dir: Directory receiving ``page_<n>.png`` (created if missing).
        dpi: Render resolution (zoom factor = dpi / 72).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    # Context manager releases the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))
63
+
64
+
65
def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes:
    """Extract a range of pages from a PDF and return as in-memory PDF bytes.

    Args:
        pdf_path: Path to the source PDF.
        start: First page index (0-indexed, inclusive).
        end: Last page index (0-indexed, inclusive).

    Returns:
        Raw bytes of a new PDF containing only the specified pages.
    """
    # Context managers close BOTH documents even if insert_pdf/tobytes raises
    # (the original leaked both handles on error).
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        dst.insert_pdf(src, from_page=start, to_page=end)
        return dst.tobytes()
83
+
84
+
85
def get_page_image_bytes(
    page_image_dir: str,
    page_num: int,
) -> bytes:
    """Load a pre-rendered page image from disk.

    Pages are expected to already exist from the upfront bulk render
    performed during PDF ingestion.
    """
    image_path = Path(page_image_dir) / f"page_{page_num}.png"
    with open(image_path, "rb") as fh:
        return fh.read()