zye0616 Claude Opus 4.6 (1M context) committed on
Commit
75ec737
·
1 Parent(s): 240e068

test: add AssessDetections BAML tests with real SAR frame

Browse files

Two test cases using an actual flood rescue aerial frame:
- SAR_PersonOnRooftop: person on rooftop satisfies rescue mission
- SAR_DogNotMissionTarget: neither person nor dog satisfies cargo mission

Also fixes AssessDetections prompt to use proper system/user roles
for OpenAI image compatibility.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

baml_client/inlinedbaml.py CHANGED
@@ -14,7 +14,7 @@ _file_map = {
14
 
15
  "clients.baml": "// ISR LLM clients\n\nclient<llm> GPT4oMini {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o-mini\"\n api_key env.OPENAI_API_KEY\n temperature 0.1\n }\n}\n\nclient<llm> GPT4o {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o\"\n api_key env.OPENAI_API_KEY\n temperature 0.2\n }\n}\n\nretry_policy Retry {\n max_retries 2\n strategy {\n type exponential_backoff\n delay_ms 500\n multiplier 2.0\n max_delay_ms 5000\n }\n}\n",
16
  "generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"go\", \"rust\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.220.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n",
17
- "isr.baml": "// ISR Mission Planning & Assessment Functions\n\n// ── Mission Planning ─────────────────────────────────────────────\n// Takes a free-form mission objective and produces:\n// 1. Concrete object class queries for the detector (YOLO/DETR/GDINO)\n// 2. A refined mission statement for downstream assessment\n\nclass MissionPlan {\n detector_queries string[] @description(\"Object class labels to feed to the detector. Use common COCO class names (person, car, truck, bicycle, motorcycle, bus, dog, cat, etc.) or short noun phrases for open-vocabulary detectors. 3-8 items.\")\n refined_mission string @description(\"A clear, one-sentence restatement of the mission objective that an analyst can evaluate each detection against.\")\n reasoning string @description(\"Brief explanation of why these queries were chosen.\")\n}\n\nfunction PlanMission(mission_text: string) -> MissionPlan {\n client GPT4oMini\n prompt #\"\n You are an ISR (Intelligence, Surveillance, Reconnaissance) mission planner.\n\n Given a free-form mission objective, determine:\n 1. What object classes a visual detector should look for (use common COCO labels when possible: person, car, truck, bus, motorcycle, bicycle, dog, cat, backpack, suitcase, etc.)\n 2. A refined mission statement that an analyst can use to evaluate each detection.\n\n Mission objective: \"{{ mission_text }}\"\n\n {{ ctx.output_format }}\n \"#\n}\n\n\n// ── Detection Assessment ─────────────────────────────────────────\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nclass DetectionInfo {\n track_id string\n class_label string\n bbox_width_px int\n bbox_height_px int\n area_ratio float @description(\"Fraction of frame area occupied by this detection\")\n speed_kph float\n direction string @description(\"Clock direction (e.g. 
'3h' for rightward) or 'unknown'\")\n}\n\nclass DetectionVerdict {\n track_id string\n mission_relevant bool @description(\"Does this CLASS of object relate to the mission at all?\")\n satisfies bool? @description(\"Does THIS SPECIFIC detection meet the mission criteria? null if uncertain.\")\n reason string @description(\"1-2 sentences explaining the assessment\")\n features map<string, string> @description(\"2-5 key-value pairs of observable properties relevant to the mission\")\n}\n\n// ── PlanMission Tests ────────────────────────────────────────────\n\ntest HeavyCargoVehicles {\n functions [PlanMission]\n args {\n mission_text \"identify vehicles that can carry heavy cargos\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"truck\" in this.detector_queries }} )\n}\n\ntest PersonOnRooftop {\n functions [PlanMission]\n args {\n mission_text \"identify person stranded on rooftop\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"person\" in this.detector_queries }} )\n}\n\n\n// ── Detection Assessment ─────────────────────────────────────────\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nfunction AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {\n client GPT4oMini\n prompt #\"\n {_.role(\"user\")}\n\n You are an ISR analyst assessing aerial drone detections against a mission objective.\n\n Mission: \"{{ mission }}\"\n\n Detected objects:\n {% for d in detections %}\n - {{ d.track_id }}: class={{ d.class_label }}, bbox={{ d.bbox_width_px }}x{{ d.bbox_height_px }}px, area_ratio={{ d.area_ratio }}, speed={{ d.speed_kph }}kph, direction={{ d.direction }}\n {% endfor %}\n\n Frame context (showing all detections):\n {{ frame_image }}\n\n Assess each detection against the mission.\n\n {{ ctx.output_format }}\n \"#\n}\n",
18
  }
19
 
20
  def get_baml_files():
 
14
 
15
  "clients.baml": "// ISR LLM clients\n\nclient<llm> GPT4oMini {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o-mini\"\n api_key env.OPENAI_API_KEY\n temperature 0.1\n }\n}\n\nclient<llm> GPT4o {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o\"\n api_key env.OPENAI_API_KEY\n temperature 0.2\n }\n}\n\nretry_policy Retry {\n max_retries 2\n strategy {\n type exponential_backoff\n delay_ms 500\n multiplier 2.0\n max_delay_ms 5000\n }\n}\n",
16
  "generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"go\", \"rust\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.220.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n",
17
+ "isr.baml": "// ISR Mission Planning & Assessment Functions\n\n// ── Mission Planning ─────────────────────────────────────────────\n// Takes a free-form mission objective and produces:\n// 1. Concrete object class queries for the detector (YOLO/DETR/GDINO)\n// 2. A refined mission statement for downstream assessment\n\nclass MissionPlan {\n detector_queries string[] @description(\"Object class labels to feed to the detector. Use common COCO class names (person, car, truck, bicycle, motorcycle, bus, dog, cat, etc.) or short noun phrases for open-vocabulary detectors. 3-8 items.\")\n refined_mission string @description(\"A clear, one-sentence restatement of the mission objective that an analyst can evaluate each detection against.\")\n reasoning string @description(\"Brief explanation of why these queries were chosen.\")\n}\n\nfunction PlanMission(mission_text: string) -> MissionPlan {\n client GPT4oMini\n prompt #\"\n You are an ISR (Intelligence, Surveillance, Reconnaissance) mission planner.\n\n Given a free-form mission objective, determine:\n 1. What object classes a visual detector should look for (use common COCO labels when possible: person, car, truck, bus, motorcycle, bicycle, dog, cat, backpack, suitcase, etc.)\n 2. A refined mission statement that an analyst can use to evaluate each detection.\n\n Mission objective: \"{{ mission_text }}\"\n\n {{ ctx.output_format }}\n \"#\n}\n\n\n// ── Detection Assessment ─────────────────────────────────────────\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nclass DetectionInfo {\n track_id string\n class_label string\n bbox_width_px int\n bbox_height_px int\n area_ratio float @description(\"Fraction of frame area occupied by this detection\")\n speed_kph float\n direction string @description(\"Clock direction (e.g. 
'3h' for rightward) or 'unknown'\")\n}\n\nclass DetectionVerdict {\n track_id string\n mission_relevant bool @description(\"Does this CLASS of object relate to the mission at all?\")\n satisfies bool? @description(\"Does THIS SPECIFIC detection meet the mission criteria? null if uncertain.\")\n reason string @description(\"1-2 sentences explaining the assessment\")\n features map<string, string> @description(\"2-5 key-value pairs of observable properties relevant to the mission\")\n}\n\n// ── PlanMission Tests ────────────────────────────────────────────\n\ntest HeavyCargoVehicles {\n functions [PlanMission]\n args {\n mission_text \"identify vehicles that can carry heavy cargos\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"truck\" in this.detector_queries }} )\n}\n\ntest PersonOnRooftop {\n functions [PlanMission]\n args {\n mission_text \"identify person stranded on rooftop\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"person\" in this.detector_queries }} )\n}\n\n\n// ── Detection Assessment ─────────────────────────────────────────\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nfunction AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {\n client GPT4oMini\n prompt #\"\n {{ _.role(\"system\") }}\n You are an ISR analyst assessing aerial drone detections against a mission objective.\n\n {{ _.role(\"user\") }}\n Mission: \"{{ mission }}\"\n\n Detected objects:\n {% for d in detections %}\n - {{ d.track_id }}: class={{ d.class_label }}, bbox={{ d.bbox_width_px }}x{{ d.bbox_height_px }}px, area_ratio={{ d.area_ratio }}, speed={{ d.speed_kph }}kph, direction={{ d.direction }}\n {% endfor %}\n\n Frame context (showing all detections):\n {{ frame_image }}\n\n Assess each detection against the mission.\n\n {{ ctx.output_format }}\n \"#\n}\n\n// ── AssessDetections Tests ───────────────────────────────────────\n// SAR flood scene: 
person + dog on rooftop, surrounded by floodwater\n\ntest SAR_PersonOnRooftop {\n functions [AssessDetections]\n args {\n mission \"identify person stranded on rooftop needing rescue\"\n detections [\n {\n track_id \"T01\"\n class_label \"person\"\n bbox_width_px 120\n bbox_height_px 280\n area_ratio 0.0059\n speed_kph 0.0\n direction \"unknown\"\n },\n {\n track_id \"T02\"\n class_label \"dog\"\n bbox_width_px 80\n bbox_height_px 50\n area_ratio 0.0007\n speed_kph 0.0\n direction \"unknown\"\n }\n ]\n frame_image {\n file \"fixtures/sar_rooftop.jpg\"\n media_type \"image/jpeg\"\n }\n }\n @@assert( {{ this|length == 2 }} )\n @@assert( {{ this[0].track_id == \"T01\" }} )\n @@assert( {{ this[0].mission_relevant == true }} )\n @@assert( {{ this[0].satisfies == true }} )\n}\n\ntest SAR_DogNotMissionTarget {\n functions [AssessDetections]\n args {\n mission \"identify vehicles capable of transporting heavy cargo\"\n detections [\n {\n track_id \"T01\"\n class_label \"person\"\n bbox_width_px 120\n bbox_height_px 280\n area_ratio 0.0059\n speed_kph 0.0\n direction \"unknown\"\n },\n {\n track_id \"T02\"\n class_label \"dog\"\n bbox_width_px 80\n bbox_height_px 50\n area_ratio 0.0007\n speed_kph 0.0\n direction \"unknown\"\n }\n ]\n frame_image {\n file \"fixtures/sar_rooftop.jpg\"\n media_type \"image/jpeg\"\n }\n }\n @@assert( {{ this|length == 2 }} )\n @@assert( {{ this[0].satisfies != true }} )\n @@assert( {{ this[1].satisfies != true }} )\n}\n",
18
  }
19
 
20
  def get_baml_files():
baml_src/fixtures/sar_rooftop.jpg ADDED

Git LFS Details

  • SHA256: 3eb820eaf8ffb1861c6a3e7574c673f4eeec553bf1cdbc9fcdaad962e5392d35
  • Pointer size: 131 Bytes
  • Size of remote file: 269 kB
baml_src/isr.baml CHANGED
@@ -75,10 +75,10 @@ test PersonOnRooftop {
75
  function AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {
76
  client GPT4oMini
77
  prompt #"
78
- {_.role("user")}
79
-
80
  You are an ISR analyst assessing aerial drone detections against a mission objective.
81
 
 
82
  Mission: "{{ mission }}"
83
 
84
  Detected objects:
@@ -94,3 +94,75 @@ function AssessDetections(mission: string, detections: DetectionInfo[], frame_im
94
  {{ ctx.output_format }}
95
  "#
96
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  function AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {
76
  client GPT4oMini
77
  prompt #"
78
+ {{ _.role("system") }}
 
79
  You are an ISR analyst assessing aerial drone detections against a mission objective.
80
 
81
+ {{ _.role("user") }}
82
  Mission: "{{ mission }}"
83
 
84
  Detected objects:
 
94
  {{ ctx.output_format }}
95
  "#
96
  }
97
+
98
+ // ── AssessDetections Tests ───────────────────────────────────────
99
+ // SAR flood scene: person + dog on rooftop, surrounded by floodwater
100
+
101
+ test SAR_PersonOnRooftop {
102
+ functions [AssessDetections]
103
+ args {
104
+ mission "identify person stranded on rooftop needing rescue"
105
+ detections [
106
+ {
107
+ track_id "T01"
108
+ class_label "person"
109
+ bbox_width_px 120
110
+ bbox_height_px 280
111
+ area_ratio 0.0059
112
+ speed_kph 0.0
113
+ direction "unknown"
114
+ },
115
+ {
116
+ track_id "T02"
117
+ class_label "dog"
118
+ bbox_width_px 80
119
+ bbox_height_px 50
120
+ area_ratio 0.0007
121
+ speed_kph 0.0
122
+ direction "unknown"
123
+ }
124
+ ]
125
+ frame_image {
126
+ file "fixtures/sar_rooftop.jpg"
127
+ media_type "image/jpeg"
128
+ }
129
+ }
130
+ @@assert( {{ this|length == 2 }} )
131
+ @@assert( {{ this[0].track_id == "T01" }} )
132
+ @@assert( {{ this[0].mission_relevant == true }} )
133
+ @@assert( {{ this[0].satisfies == true }} )
134
+ }
135
+
136
+ test SAR_DogNotMissionTarget {
137
+ functions [AssessDetections]
138
+ args {
139
+ mission "identify vehicles capable of transporting heavy cargo"
140
+ detections [
141
+ {
142
+ track_id "T01"
143
+ class_label "person"
144
+ bbox_width_px 120
145
+ bbox_height_px 280
146
+ area_ratio 0.0059
147
+ speed_kph 0.0
148
+ direction "unknown"
149
+ },
150
+ {
151
+ track_id "T02"
152
+ class_label "dog"
153
+ bbox_width_px 80
154
+ bbox_height_px 50
155
+ area_ratio 0.0007
156
+ speed_kph 0.0
157
+ direction "unknown"
158
+ }
159
+ ]
160
+ frame_image {
161
+ file "fixtures/sar_rooftop.jpg"
162
+ media_type "image/jpeg"
163
+ }
164
+ }
165
+ @@assert( {{ this|length == 2 }} )
166
+ @@assert( {{ this[0].satisfies != true }} )
167
+ @@assert( {{ this[1].satisfies != true }} )
168
+ }