Ryan2219 committed on
Commit
b5bb6b7
·
verified ·
1 Parent(s): b2d54a4

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -67
app.py CHANGED
@@ -25,6 +25,7 @@ class InterfaceState:
25
  self.analysis_messages = []
26
  self.current_chapter = ""
27
  self.current_images = []
 
28
  self.final_answer = ""
29
  self.done = False
30
  self.lock = threading.Lock()
@@ -50,6 +51,18 @@ class InterfaceState:
50
  with self.lock:
51
  self.current_images.append(img_pil)
52
  return self.current_images.copy()
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def clear(self):
55
  with self.lock:
@@ -59,6 +72,8 @@ class InterfaceState:
59
  self.current_images.clear()
60
  self.final_answer = ""
61
  self.done = False
 
 
62
 
63
  state = InterfaceState()
64
 
@@ -127,6 +142,8 @@ chroma_client = chromadb.PersistentClient(path="nyc_code_db")
127
  embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
128
  collection = chroma_client.get_collection(name="nyc_building_codes", embedding_function=embedding_model)
129
 
 
 
130
  # Modified tool functions with Gradio updates
131
  def search_page_text(page_number: int, research_goal: str):
132
  state.add_log(f'🔍 Searching page **{page_metadata[page_number]["sheet_title"]}** for details')
@@ -539,6 +556,33 @@ def extract_json(s: str):
539
  json_str = s[start:end+1]
540
  return json.loads(json_str)
541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  def execute_page_expert(expert_instructions: str, page_num: int):
543
  state.add_log(f'👁️ Spawning Page Expert for page **{page_num}**')
544
  state.add_analysis(f"👁️ Page Expert searching for {expert_instructions}")
@@ -686,7 +730,7 @@ def execute_page_expert(expert_instructions: str, page_num: int):
686
  }
687
  ]
688
 
689
- MAX_TURNS = 10
690
 
691
  for turn in range(MAX_TURNS):
692
  response = client.chat.completions.create(
@@ -702,23 +746,46 @@ def execute_page_expert(expert_instructions: str, page_num: int):
702
  if msg.content:
703
  try:
704
  res = extract_json(msg.content)
 
 
705
  state.add_analysis(
706
  f"🟨 Page Analyst\n{res.get('findings','')}"
707
  )
708
- tile_idxs = res.get("visual_pointers", [])
709
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
 
711
- stitched_bytes = merge_tiles(
712
- tile_indexes=tile_idxs,
713
- page_num=page_num
 
 
 
 
 
 
 
 
714
  )
715
 
716
- stitched_img = Image.open(
717
- io.BytesIO(stitched_bytes)
718
- )
719
- state.add_image(stitched_img)
720
-
721
- return extract_json(msg.content)
722
  except:
723
  pass
724
 
@@ -783,7 +850,7 @@ def execute_page_expert(expert_instructions: str, page_num: int):
783
  tools_list = [search_page_text, nyc_legal_sub_agent, execute_page_expert]
784
  import time
785
  planner = genai.Client()
786
- planner_model = "gemini-3-pro-preview"
787
  planner_prompt = f"""
788
  You are the Lead Architectural Compliance Planner for NYC Building Code and Zoning review.
789
 
@@ -886,6 +953,13 @@ planner_prompt = f"""
886
  - NEVER issue a final verdict without calling `execute_page_expert`
887
  - If no page contains sufficient proof, return **Unverified**
888
  - Prefer false negatives over false positives
 
 
 
 
 
 
 
889
 
890
  ========================
891
  QUALITY STANDARD
@@ -907,81 +981,75 @@ def agent_worker(user_question):
907
  state.add_log(f'🚀 Starting analysis for: **{user_question}**')
908
  state.add_analysis("🧠 Planner initialized. Awaiting tool calls...")
909
 
 
 
910
  response = chat.send_message(user_question)
911
 
 
 
 
912
  while response.candidates[0].content.parts[0].function_call:
913
  tool_responses = []
914
- pending_images = []
915
 
916
  for part in response.candidates[0].content.parts:
917
  if part.function_call:
918
  name = part.function_call.name
919
  args = part.function_call.args
920
-
921
- state.add_log(f'🛠️ Planner calling: **{name}**')
922
- state.add_analysis(
923
- f"### 🛠️ Tool Call: `{name}`\n"
924
- f"```json\n{json.dumps(args, indent=2)}\n```"
925
- )
926
 
927
  func = globals()[name]
928
  result = func(**args)
929
 
930
- # -----------------------------
931
- # STREAM REAL TOOL OUTPUTS
932
- # -----------------------------
933
-
934
- # search_page_text
935
- # execute_page_expert
936
-
937
- if name == "execute_page_expert":
938
-
939
- tile_idxs = result.get("visual_pointers", [])
940
- page_num = args.get("page_num")
941
-
942
- if tile_idxs:
943
- state.add_log(f'📸 Stitching high-res proof for tiles: **{tile_idxs}**')
944
- state.add_analysis(
945
- f"📸 Visual proof requested for tiles `{tile_idxs}` on page `{page_num}`"
946
- )
947
-
948
- stitched_bytes = merge_tiles(
949
- tile_indexes=tile_idxs,
950
- page_num=page_num
951
- )
952
-
953
- pending_images.append(
954
- types.Part.from_bytes(stitched_bytes, mime_type="image/png")
955
- )
956
- pending_images.append(types.Part.from_bytes(
957
- image_bytes_list[page_num],
958
- mime_type="image/png"
959
- ))
960
-
961
-
962
  tool_responses.append(
963
- types.Part.from_function_response(
964
- name=name,
965
- response={"result": result}
966
- )
967
  )
968
-
969
 
970
- state.add_analysis("🧠 Returning tool outputs to planner...")
971
  response = chat.send_message(tool_responses)
972
- if pending_images:
973
- state.add_log(f'📸 Sending {len(pending_images)} images to Planner...')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974
 
975
- # We send the images with a prompt telling the model what they are
976
- pending_images.insert(0, "Here is the visual proof generated by the tool. Please use this to confirm your final answer:")
 
 
 
 
 
977
 
978
- # This generates the ACTUAL final answer that sees the image
979
- response = chat.send_message(pending_images)
980
-
 
981
  state.add_log('🏁 **ANALYSIS COMPLETE**')
982
- state.add_analysis("✅ Planner finished. Final verdict generated.")
983
- state.final_answer = response.text
984
  state.done = True
 
985
 
986
  def run_agentic_workflow(user_question):
987
  state.done = False
 
25
  self.analysis_messages = []
26
  self.current_chapter = ""
27
  self.current_images = []
28
+ self.staged_audit_images = []
29
  self.final_answer = ""
30
  self.done = False
31
  self.lock = threading.Lock()
 
51
  with self.lock:
52
  self.current_images.append(img_pil)
53
  return self.current_images.copy()
54
+
55
def add_staged_image_part(self, image_part):
    """Queue one image part for the later Gemini audit turn.

    The append is performed while holding ``self.lock`` so concurrent
    callers do not interleave their updates to ``staged_audit_images``.

    Args:
        image_part: The object to stage; it is stored as-is, with no
            validation or conversion.
    """
    with self.lock:
        staged = self.staged_audit_images
        staged.append(image_part)
        # Console trace so staging can be confirmed while debugging.
        total_staged = len(staged)
        print(f"DEBUG: Staged image part. Total staged: {total_staged}")
61
+
62
def get_staged_images(self):
    """Return a snapshot of the images staged for the audit turn.

    The snapshot is taken while holding ``self.lock``. A new list is
    returned, so callers may mutate it without affecting the staged list.

    NOTE(review): the visible part of ``clear()`` does not appear to reset
    ``staged_audit_images``, so images from earlier runs may still be
    present here -- confirm against the full class.
    """
    with self.lock:
        snapshot = self.staged_audit_images.copy()
    return snapshot
66
 
67
  def clear(self):
68
  with self.lock:
 
72
  self.current_images.clear()
73
  self.final_answer = ""
74
  self.done = False
75
+
76
+
77
 
78
  state = InterfaceState()
79
 
 
142
  embedding_model = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
143
  collection = chroma_client.get_collection(name="nyc_building_codes", embedding_function=embedding_model)
144
 
145
+ all_pending_images = []
146
+
147
  # Modified tool functions with Gradio updates
148
  def search_page_text(page_number: int, research_goal: str):
149
  state.add_log(f'🔍 Searching page **{page_metadata[page_number]["sheet_title"]}** for details')
 
556
  json_str = s[start:end+1]
557
  return json.loads(json_str)
558
 
559
def sanitize_tile_indices(data):
    """Normalize an LLM-produced ``visual_pointers`` value to a list of ints.

    Accepted shapes:
      * ``None`` or an empty container/string -> ``[]``
      * a list/tuple of ints, integral floats, or numeric strings,
        e.g. ``[1, 2]``, ``["1", " 2 "]``, ``[1.0, "2.0"]``
      * a free-form string such as ``"1, 2, 3"`` or ``"[1, 2, 3]"``; every
        run of digits is extracted in order
      * a single bare integer (including ``0``) or integral float

    Elements that cannot be read as a whole number (``None``, booleans,
    ``1.5``, ``"abc"``, nested containers) are skipped rather than raising,
    because the input comes from model output and is best-effort.

    Args:
        data: The raw ``visual_pointers`` value parsed from the model's JSON.

    Returns:
        A list of ``int`` tile indices, in input order. Duplicates are kept.
    """
    # bool is a subclass of int; a bare True/False is never a tile index.
    if data is None or isinstance(data, bool):
        return []

    # A bare number: previously dropped (and 0 was treated as "no data").
    if isinstance(data, (int, float)):
        index = _coerce_tile_index(data)
        return [] if index is None else [index]

    if isinstance(data, str):
        # Free-form text: pull out every run of digits, in order.
        return [int(digits) for digits in re.findall(r'\d+', data)]

    if isinstance(data, (list, tuple)):
        coerced = (_coerce_tile_index(item) for item in data)
        return [index for index in coerced if index is not None]

    return []


def _coerce_tile_index(item):
    """Return ``item`` as an int, or ``None`` if it is not a whole number."""
    if isinstance(item, bool):
        return None
    if isinstance(item, int):
        return item
    if isinstance(item, float):
        # is_integer() is False for nan/inf, so those are rejected too.
        return int(item) if item.is_integer() else None

    text = str(item).strip()
    try:
        return int(text)
    except ValueError:
        pass

    # Models sometimes emit indices as "3.0"; accept only integral values.
    try:
        as_float = float(text)
    except ValueError:
        return None
    return int(as_float) if as_float.is_integer() else None
585
+
586
  def execute_page_expert(expert_instructions: str, page_num: int):
587
  state.add_log(f'👁️ Spawning Page Expert for page **{page_num}**')
588
  state.add_analysis(f"👁️ Page Expert searching for {expert_instructions}")
 
730
  }
731
  ]
732
 
733
+ MAX_TURNS = 3
734
 
735
  for turn in range(MAX_TURNS):
736
  response = client.chat.completions.create(
 
746
  if msg.content:
747
  try:
748
  res = extract_json(msg.content)
749
+
750
+
751
  state.add_analysis(
752
  f"🟨 Page Analyst\n{res.get('findings','')}"
753
  )
754
+ raw_pointers = res.get("visual_pointers", [])
755
+ tile_idxs = sanitize_tile_indices(raw_pointers)
756
+
757
+
758
+ if tile_idxs and tile_idxs != '[]':
759
+ stitched_bytes = merge_tiles(
760
+ tile_indexes=tile_idxs,
761
+ page_num=page_num
762
+ )
763
+
764
+ state.add_log(f'📸 Staging {len(tile_idxs)} tiles for final audit...')
765
+
766
+ # Store these to use AFTER the chat finishes
767
+ state.add_staged_image_part(
768
+ types.Part.from_bytes(
769
+ data=stitched_bytes, # <-- 'data=' is required here
770
+ mime_type="image/png"
771
+ )
772
+ )
773
+
774
 
775
+ stitched_img = Image.open(
776
+ io.BytesIO(stitched_bytes)
777
+ )
778
+ state.add_image(stitched_img)
779
+
780
+
781
+ state.add_staged_image_part(
782
+ types.Part.from_bytes(
783
+ data=image_bytes_list[page_num], # <-- 'data=' is required here
784
+ mime_type="image/png"
785
+ )
786
  )
787
 
788
+ return res
 
 
 
 
 
789
  except:
790
  pass
791
 
 
850
  tools_list = [search_page_text, nyc_legal_sub_agent, execute_page_expert]
851
  import time
852
  planner = genai.Client()
853
+ planner_model = "gemini-3-flash-preview"
854
  planner_prompt = f"""
855
  You are the Lead Architectural Compliance Planner for NYC Building Code and Zoning review.
856
 
 
953
  - NEVER issue a final verdict without calling `execute_page_expert`
954
  - If no page contains sufficient proof, return **Unverified**
955
  - Prefer false negatives over false positives
956
+ *** CRITICAL VISUAL PROTOCOL ***
957
+ - When `execute_page_expert` returns, it will explicitly state "VISUAL_PROOF_PENDING".
958
+ - When you see this, your ONLY response must be: "Awaiting visual proof."
959
+ - DO NOT attempt to guess the verdict.
960
+ - DO NOT complain about missing images.
961
+ - Simply wait. The user will immediately send the images in the next turn.
962
+
963
 
964
  ========================
965
  QUALITY STANDARD
 
981
  state.add_log(f'🚀 Starting analysis for: **{user_question}**')
982
  state.add_analysis("🧠 Planner initialized. Awaiting tool calls...")
983
 
984
+ # 1. Initialize the Stateful Chat
985
+ chat = planner.chats.create(model=planner_model, config=config)
986
  response = chat.send_message(user_question)
987
 
988
+ # 2. Track images throughout the conversation
989
+
990
+ # 3. Standard Tool Loop (Phases 1-3)
991
  while response.candidates[0].content.parts[0].function_call:
992
  tool_responses = []
 
993
 
994
  for part in response.candidates[0].content.parts:
995
  if part.function_call:
996
  name = part.function_call.name
997
  args = part.function_call.args
998
+ state.add_log(f'🛠️ Tool Call: **{name}**')
 
 
 
 
 
999
 
1000
  func = globals()[name]
1001
  result = func(**args)
1002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1003
  tool_responses.append(
1004
+ types.Part.from_function_response(name=name, response={"result": result})
 
 
 
1005
  )
 
1006
 
1007
+ # Send tool results back to the stateful chat
1008
  response = chat.send_message(tool_responses)
1009
+
1010
+ # -----------------------------------------------------------------
1011
+ # PHASE 4: THE POST-CHAT HANDOFF (The "Visual Audit")
1012
+ # -----------------------------------------------------------------
1013
+
1014
+ # At this point, the while loop has ended.
1015
+ # 'response.text' contains the model's preliminary answer.
1016
+
1017
+ audit_images = state.get_staged_images()
1018
+
1019
+ if audit_images:
1020
+ state.add_log(f"👁️ Preliminary answer received. Performing audit with {len(audit_images)} images...")
1021
+
1022
+ # 1. Construct the audit parts
1023
+ # Ensure 'text=' is used for the Part constructor
1024
+ audit_parts = [
1025
+ types.Part.from_text(
1026
+ text="You have provided a preliminary verdict. Now, look at these images "
1027
+ "to verify your findings. If the visual evidence contradicts your "
1028
+ "text-based search, update your verdict now. "
1029
+ ),
1030
+ *audit_images
1031
+ ]
1032
+
1033
+ try:
1034
+ # 2. Send directly through the 'chat' session
1035
+ # This automatically appends to history and maintains the session state
1036
+ final_response = chat.send_message(audit_parts)
1037
 
1038
+ state.final_answer = final_response.text
1039
+
1040
+ except Exception as e:
1041
+ # If the above fails, try the explicit message keyword
1042
+ state.add_log("🔄 Retrying audit with explicit message keyword...")
1043
+ final_response = chat.send_message(message=audit_parts)
1044
+ state.final_answer = final_response.text
1045
 
1046
+ else:
1047
+ state.add_log("⚠️ No images found in state. Skipping visual audit.")
1048
+ state.final_answer = response.text
1049
+
1050
  state.add_log('🏁 **ANALYSIS COMPLETE**')
 
 
1051
  state.done = True
1052
+
1053
 
1054
  def run_agentic_workflow(user_question):
1055
  state.done = False