Spaces:
Paused
Paused
test: add AssessDetections BAML tests with real SAR frame
Browse files
Two test cases using an actual flood rescue aerial frame:
- SAR_PersonOnRooftop: person on rooftop satisfies rescue mission
- SAR_DogNotMissionTarget: neither person nor dog satisfies cargo mission
Also fixes AssessDetections prompt to use proper system/user roles
for OpenAI image compatibility.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- baml_client/inlinedbaml.py +1 -1
- baml_src/fixtures/sar_rooftop.jpg +3 -0
- baml_src/isr.baml +74 -2
baml_client/inlinedbaml.py
CHANGED
|
@@ -14,7 +14,7 @@ _file_map = {
|
|
| 14 |
|
| 15 |
"clients.baml": "// ISR LLM clients\n\nclient<llm> GPT4oMini {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o-mini\"\n api_key env.OPENAI_API_KEY\n temperature 0.1\n }\n}\n\nclient<llm> GPT4o {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o\"\n api_key env.OPENAI_API_KEY\n temperature 0.2\n }\n}\n\nretry_policy Retry {\n max_retries 2\n strategy {\n type exponential_backoff\n delay_ms 500\n multiplier 2.0\n max_delay_ms 5000\n }\n}\n",
|
| 16 |
"generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"go\", \"rust\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.220.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n",
|
| 17 |
-
"isr.baml": "// ISR Mission Planning & Assessment Functions\n\n// ββ Mission Planning βββββββββββββββββββββββββββββββββββββββββββββ\n// Takes a free-form mission objective and produces:\n// 1. Concrete object class queries for the detector (YOLO/DETR/GDINO)\n// 2. A refined mission statement for downstream assessment\n\nclass MissionPlan {\n detector_queries string[] @description(\"Object class labels to feed to the detector. Use common COCO class names (person, car, truck, bicycle, motorcycle, bus, dog, cat, etc.) or short noun phrases for open-vocabulary detectors. 3-8 items.\")\n refined_mission string @description(\"A clear, one-sentence restatement of the mission objective that an analyst can evaluate each detection against.\")\n reasoning string @description(\"Brief explanation of why these queries were chosen.\")\n}\n\nfunction PlanMission(mission_text: string) -> MissionPlan {\n client GPT4oMini\n prompt #\"\n You are an ISR (Intelligence, Surveillance, Reconnaissance) mission planner.\n\n Given a free-form mission objective, determine:\n 1. What object classes a visual detector should look for (use common COCO labels when possible: person, car, truck, bus, motorcycle, bicycle, dog, cat, backpack, suitcase, etc.)\n 2. A refined mission statement that an analyst can use to evaluate each detection.\n\n Mission objective: \"{{ mission_text }}\"\n\n {{ ctx.output_format }}\n \"#\n}\n\n\n// ββ Detection Assessment βββββββββββββββββββββββββββββββββββββββββ\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nclass DetectionInfo {\n track_id string\n class_label string\n bbox_width_px int\n bbox_height_px int\n area_ratio float @description(\"Fraction of frame area occupied by this detection\")\n speed_kph float\n direction string @description(\"Clock direction (e.g. 
'3h' for rightward) or 'unknown'\")\n}\n\nclass DetectionVerdict {\n track_id string\n mission_relevant bool @description(\"Does this CLASS of object relate to the mission at all?\")\n satisfies bool? @description(\"Does THIS SPECIFIC detection meet the mission criteria? null if uncertain.\")\n reason string @description(\"1-2 sentences explaining the assessment\")\n features map<string, string> @description(\"2-5 key-value pairs of observable properties relevant to the mission\")\n}\n\n// ββ PlanMission Tests ββββββββββββββββββββββββββββββββββββββββββββ\n\ntest HeavyCargoVehicles {\n functions [PlanMission]\n args {\n mission_text \"identify vehicles that can carry heavy cargos\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"truck\" in this.detector_queries }} )\n}\n\ntest PersonOnRooftop {\n functions [PlanMission]\n args {\n mission_text \"identify person stranded on rooftop\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"person\" in this.detector_queries }} )\n}\n\n\n// ββ Detection Assessment βββββββββββββββββββββββββββββββββββββββββ\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nfunction AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {\n client GPT4oMini\n prompt #\"\n {_.role(\"
|
| 18 |
}
|
| 19 |
|
| 20 |
def get_baml_files():
|
|
|
|
| 14 |
|
| 15 |
"clients.baml": "// ISR LLM clients\n\nclient<llm> GPT4oMini {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o-mini\"\n api_key env.OPENAI_API_KEY\n temperature 0.1\n }\n}\n\nclient<llm> GPT4o {\n provider openai\n retry_policy Retry\n options {\n model \"gpt-4o\"\n api_key env.OPENAI_API_KEY\n temperature 0.2\n }\n}\n\nretry_policy Retry {\n max_retries 2\n strategy {\n type exponential_backoff\n delay_ms 500\n multiplier 2.0\n max_delay_ms 5000\n }\n}\n",
|
| 16 |
"generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"go\", \"rust\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.220.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n",
|
| 17 |
+
"isr.baml": "// ISR Mission Planning & Assessment Functions\n\n// ββ Mission Planning βββββββββββββββββββββββββββββββββββββββββββββ\n// Takes a free-form mission objective and produces:\n// 1. Concrete object class queries for the detector (YOLO/DETR/GDINO)\n// 2. A refined mission statement for downstream assessment\n\nclass MissionPlan {\n detector_queries string[] @description(\"Object class labels to feed to the detector. Use common COCO class names (person, car, truck, bicycle, motorcycle, bus, dog, cat, etc.) or short noun phrases for open-vocabulary detectors. 3-8 items.\")\n refined_mission string @description(\"A clear, one-sentence restatement of the mission objective that an analyst can evaluate each detection against.\")\n reasoning string @description(\"Brief explanation of why these queries were chosen.\")\n}\n\nfunction PlanMission(mission_text: string) -> MissionPlan {\n client GPT4oMini\n prompt #\"\n You are an ISR (Intelligence, Surveillance, Reconnaissance) mission planner.\n\n Given a free-form mission objective, determine:\n 1. What object classes a visual detector should look for (use common COCO labels when possible: person, car, truck, bus, motorcycle, bicycle, dog, cat, backpack, suitcase, etc.)\n 2. A refined mission statement that an analyst can use to evaluate each detection.\n\n Mission objective: \"{{ mission_text }}\"\n\n {{ ctx.output_format }}\n \"#\n}\n\n\n// ββ Detection Assessment βββββββββββββββββββββββββββββββββββββββββ\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nclass DetectionInfo {\n track_id string\n class_label string\n bbox_width_px int\n bbox_height_px int\n area_ratio float @description(\"Fraction of frame area occupied by this detection\")\n speed_kph float\n direction string @description(\"Clock direction (e.g. 
'3h' for rightward) or 'unknown'\")\n}\n\nclass DetectionVerdict {\n track_id string\n mission_relevant bool @description(\"Does this CLASS of object relate to the mission at all?\")\n satisfies bool? @description(\"Does THIS SPECIFIC detection meet the mission criteria? null if uncertain.\")\n reason string @description(\"1-2 sentences explaining the assessment\")\n features map<string, string> @description(\"2-5 key-value pairs of observable properties relevant to the mission\")\n}\n\n// ββ PlanMission Tests ββββββββββββββββββββββββββββββββββββββββββββ\n\ntest HeavyCargoVehicles {\n functions [PlanMission]\n args {\n mission_text \"identify vehicles that can carry heavy cargos\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"truck\" in this.detector_queries }} )\n}\n\ntest PersonOnRooftop {\n functions [PlanMission]\n args {\n mission_text \"identify person stranded on rooftop\"\n }\n @@assert( {{ this.detector_queries|length >= 3 }} )\n @@assert( {{ \"person\" in this.detector_queries }} )\n}\n\n\n// ββ Detection Assessment βββββββββββββββββββββββββββββββββββββββββ\n// Replaces hand-rolled JSON parsing with type-safe BAML output\n\nfunction AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {\n client GPT4oMini\n prompt #\"\n {{ _.role(\"system\") }}\n You are an ISR analyst assessing aerial drone detections against a mission objective.\n\n {{ _.role(\"user\") }}\n Mission: \"{{ mission }}\"\n\n Detected objects:\n {% for d in detections %}\n - {{ d.track_id }}: class={{ d.class_label }}, bbox={{ d.bbox_width_px }}x{{ d.bbox_height_px }}px, area_ratio={{ d.area_ratio }}, speed={{ d.speed_kph }}kph, direction={{ d.direction }}\n {% endfor %}\n\n Frame context (showing all detections):\n {{ frame_image }}\n\n Assess each detection against the mission.\n\n {{ ctx.output_format }}\n \"#\n}\n\n// ββ AssessDetections Tests βββββββββββββββββββββββββββββββββββββββ\n// SAR flood scene: 
person + dog on rooftop, surrounded by floodwater\n\ntest SAR_PersonOnRooftop {\n functions [AssessDetections]\n args {\n mission \"identify person stranded on rooftop needing rescue\"\n detections [\n {\n track_id \"T01\"\n class_label \"person\"\n bbox_width_px 120\n bbox_height_px 280\n area_ratio 0.0059\n speed_kph 0.0\n direction \"unknown\"\n },\n {\n track_id \"T02\"\n class_label \"dog\"\n bbox_width_px 80\n bbox_height_px 50\n area_ratio 0.0007\n speed_kph 0.0\n direction \"unknown\"\n }\n ]\n frame_image {\n file \"fixtures/sar_rooftop.jpg\"\n media_type \"image/jpeg\"\n }\n }\n @@assert( {{ this|length == 2 }} )\n @@assert( {{ this[0].track_id == \"T01\" }} )\n @@assert( {{ this[0].mission_relevant == true }} )\n @@assert( {{ this[0].satisfies == true }} )\n}\n\ntest SAR_DogNotMissionTarget {\n functions [AssessDetections]\n args {\n mission \"identify vehicles capable of transporting heavy cargo\"\n detections [\n {\n track_id \"T01\"\n class_label \"person\"\n bbox_width_px 120\n bbox_height_px 280\n area_ratio 0.0059\n speed_kph 0.0\n direction \"unknown\"\n },\n {\n track_id \"T02\"\n class_label \"dog\"\n bbox_width_px 80\n bbox_height_px 50\n area_ratio 0.0007\n speed_kph 0.0\n direction \"unknown\"\n }\n ]\n frame_image {\n file \"fixtures/sar_rooftop.jpg\"\n media_type \"image/jpeg\"\n }\n }\n @@assert( {{ this|length == 2 }} )\n @@assert( {{ this[0].satisfies != true }} )\n @@assert( {{ this[1].satisfies != true }} )\n}\n",
|
| 18 |
}
|
| 19 |
|
| 20 |
def get_baml_files():
|
baml_src/fixtures/sar_rooftop.jpg
ADDED
|
Git LFS Details
|
baml_src/isr.baml
CHANGED
|
@@ -75,10 +75,10 @@ test PersonOnRooftop {
|
|
| 75 |
function AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {
|
| 76 |
client GPT4oMini
|
| 77 |
prompt #"
|
| 78 |
-
{_.role("
|
| 79 |
-
|
| 80 |
You are an ISR analyst assessing aerial drone detections against a mission objective.
|
| 81 |
|
|
|
|
| 82 |
Mission: "{{ mission }}"
|
| 83 |
|
| 84 |
Detected objects:
|
|
@@ -94,3 +94,75 @@ function AssessDetections(mission: string, detections: DetectionInfo[], frame_im
|
|
| 94 |
{{ ctx.output_format }}
|
| 95 |
"#
|
| 96 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
function AssessDetections(mission: string, detections: DetectionInfo[], frame_image: image) -> DetectionVerdict[] {
|
| 76 |
client GPT4oMini
|
| 77 |
prompt #"
|
| 78 |
+
{{ _.role("system") }}
|
|
|
|
| 79 |
You are an ISR analyst assessing aerial drone detections against a mission objective.
|
| 80 |
|
| 81 |
+
{{ _.role("user") }}
|
| 82 |
Mission: "{{ mission }}"
|
| 83 |
|
| 84 |
Detected objects:
|
|
|
|
| 94 |
{{ ctx.output_format }}
|
| 95 |
"#
|
| 96 |
}
|
| 97 |
+
|
| 98 |
+
// ── AssessDetections Tests ───────────────────────────────────────
|
| 99 |
+
// SAR flood scene: person + dog on rooftop, surrounded by floodwater
|
| 100 |
+
|
| 101 |
+
test SAR_PersonOnRooftop {
|
| 102 |
+
functions [AssessDetections]
|
| 103 |
+
args {
|
| 104 |
+
mission "identify person stranded on rooftop needing rescue"
|
| 105 |
+
detections [
|
| 106 |
+
{
|
| 107 |
+
track_id "T01"
|
| 108 |
+
class_label "person"
|
| 109 |
+
bbox_width_px 120
|
| 110 |
+
bbox_height_px 280
|
| 111 |
+
area_ratio 0.0059
|
| 112 |
+
speed_kph 0.0
|
| 113 |
+
direction "unknown"
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
track_id "T02"
|
| 117 |
+
class_label "dog"
|
| 118 |
+
bbox_width_px 80
|
| 119 |
+
bbox_height_px 50
|
| 120 |
+
area_ratio 0.0007
|
| 121 |
+
speed_kph 0.0
|
| 122 |
+
direction "unknown"
|
| 123 |
+
}
|
| 124 |
+
]
|
| 125 |
+
frame_image {
|
| 126 |
+
file "fixtures/sar_rooftop.jpg"
|
| 127 |
+
media_type "image/jpeg"
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
@@assert( {{ this|length == 2 }} )
|
| 131 |
+
@@assert( {{ this[0].track_id == "T01" }} )
|
| 132 |
+
@@assert( {{ this[0].mission_relevant == true }} )
|
| 133 |
+
@@assert( {{ this[0].satisfies == true }} )
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
test SAR_DogNotMissionTarget {
|
| 137 |
+
functions [AssessDetections]
|
| 138 |
+
args {
|
| 139 |
+
mission "identify vehicles capable of transporting heavy cargo"
|
| 140 |
+
detections [
|
| 141 |
+
{
|
| 142 |
+
track_id "T01"
|
| 143 |
+
class_label "person"
|
| 144 |
+
bbox_width_px 120
|
| 145 |
+
bbox_height_px 280
|
| 146 |
+
area_ratio 0.0059
|
| 147 |
+
speed_kph 0.0
|
| 148 |
+
direction "unknown"
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
track_id "T02"
|
| 152 |
+
class_label "dog"
|
| 153 |
+
bbox_width_px 80
|
| 154 |
+
bbox_height_px 50
|
| 155 |
+
area_ratio 0.0007
|
| 156 |
+
speed_kph 0.0
|
| 157 |
+
direction "unknown"
|
| 158 |
+
}
|
| 159 |
+
]
|
| 160 |
+
frame_image {
|
| 161 |
+
file "fixtures/sar_rooftop.jpg"
|
| 162 |
+
media_type "image/jpeg"
|
| 163 |
+
}
|
| 164 |
+
}
|
| 165 |
+
@@assert( {{ this|length == 2 }} )
|
| 166 |
+
@@assert( {{ this[0].satisfies != true }} )
|
| 167 |
+
@@assert( {{ this[1].satisfies != true }} )
|
| 168 |
+
}
|