⚠️ Upload image.
" process_img = _prepare_image_for_model(image_in, short_size) output_text, token_sequence, out_info = GLOBAL_WORKER.generate( process_img, categories_list, model_mode, temp=temp, top_p=top_p, top_k=top_k, question_override=question_override ) detections = parse_mixed_results(output_text, category_str) frame_bgr = cv2.cvtColor(np.array(image_in), cv2.COLOR_RGB2BGR) out_img_bgr = draw_on_frame(frame_bgr, detections, draw_label=True) output_image = Image.fromarray(cv2.cvtColor(out_img_bgr, cv2.COLOR_BGR2RGB)) _log_to_dataset("image", ", ".join(categories_list), model_mode, question_override or category_str, output_text, image_in, output_image) return gr.update(value=output_image, visible=True), gr.update(value=None, visible=False), generate_dynamic_html(token_sequence, out_info, output_text) @spaces.GPU(duration=180) def _run_video_inference(video_in, categories_list, category_str, model_mode, temp, top_p, top_k, short_size, question_override, max_video_frames): import subprocess as _sp if video_in is None: return gr.update(value=None, visible=False), gr.update(value=None, visible=True), "⚠️ Upload video.
" cap = cv2.VideoCapture(video_in) fps = cap.get(cv2.CAP_PROP_FPS) vid_w, vid_h = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) all_frames = [] while cap.isOpened(): ret, frame = cap.read() if not ret: break all_frames.append(frame) cap.release() total = len(all_frames) max_frames = int(max_video_frames) if max_video_frames else 4 sample_indices = list(range(total)) if total <= max_frames else [int(round(i * (total - 1) / (max_frames - 1))) for i in range(max_frames)] sampled_frames = [all_frames[i] for i in sample_indices] out_fps = max(1.0, len(sampled_frames) / (total / fps)) if fps > 0 else 5.0 del all_frames gc.collect() inference_results = [] for frame in sampled_frames: pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) process_img = _prepare_image_for_model(pil_img, short_size) output_text, _, _ = GLOBAL_WORKER.generate(process_img, categories_list, model_mode, temp=temp, top_p=top_p, top_k=top_k, question_override=question_override) inference_results.append(output_text) tmp_raw = tempfile.mktemp(suffix=".raw.mp4") out_video_path = tempfile.mktemp(suffix=".mp4") out = cv2.VideoWriter(tmp_raw, cv2.VideoWriter_fourcc(*"mp4v"), out_fps, (vid_w, vid_h)) for frame, output_text in zip(sampled_frames, inference_results): detections = parse_mixed_results(output_text, category_str) valid_results = _postprocess_detections(detections, vid_w, vid_h) out.write(draw_on_frame(frame, valid_results, draw_label=True)) out.release() _sp.run(["ffmpeg", "-y", "-i", tmp_raw, "-c:v", "libx264", "-preset", "ultrafast", "-crf", "23", "-pix_fmt", "yuv420p", out_video_path], capture_output=True) if os.path.exists(tmp_raw): os.remove(tmp_raw) combined_raw_text = "\n\n".join([f"--- Frame {i+1} ---\n{t}" for i, t in enumerate(inference_results)]) return gr.update(value=None, visible=False), gr.update(value=out_video_path, visible=True), generate_dynamic_html([], "Processed Loop Successful", combined_raw_text) def run_inference(input_type, image_in, video_in, task_type, category_str, model_mode, temp, top_p, top_k, short_side, question_override, max_video_frames): categories_list = [c.strip() for c in category_str.split(",") if c.strip()] or ["object"] final_override = question_override.strip() if (question_override and question_override.strip()) else None if input_type == "Image": return _run_image_inference(image_in, categories_list, category_str, model_mode, temp, top_p, top_k, short_side, final_override) return _run_video_inference(video_in, categories_list, category_str, model_mode, temp, top_p, top_k, short_side, final_override, max_video_frames) # ============================================================ # GRADIO INTERFACE LAYOUT BUILD # ============================================================ def build_ui(): with gr.Blocks(title="LocateAnything Grounding Suite") as demo: gr.Markdown("# 🔍 LocateAnything Grounding Studio\nInfer target regions, visual boxes, and point indicators.") with gr.Row(): with gr.Column(scale=1): input_type = gr.Radio(["Image", "Video"], value="Image", label="Input Format") image_input = gr.Image(type="pil", label="Source Image", visible=True) video_input = gr.Video(label="Source Video", visible=False) task_dropdown = gr.Dropdown(["Detection", "Grounding", "OCR", "GUI", "Pointing"], value="Detection", label="Goal Context Task") category_input = gr.Textbox(label="Categories / Label Targets (comma separated)", value="car, pedestrian") raw_prompt_box = gr.Textbox(label="Generated Execution Prompt (Read Only)", value="Locate all the instances that matches the following description: carpedestrian.", interactive=False) with gr.Accordion("Advanced Parameters", open=False): model_dropdown = gr.Dropdown(["hybrid", "fast", "slow"], value="hybrid", label="Decoding Engine Mode") temp_slider = gr.Slider(0.0, 1.0, value=0.7, step=0.1, label="Temperature") top_p_slider = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top P") top_k_slider = gr.Slider(1, 100, value=50, step=1, label="Top K") short_size_input = gr.Slider(0, 1024, value=1024, step=64, label="Max Downscaling Res Constraint (0 for Native)") max_video_frames_slider = gr.Slider(1, 16, value=4, step=1, label="Video Sample Extraction Cap") run_btn = gr.Button("Run Inference", variant="primary") with gr.Column(scale=1): output_image = gr.Image(label="Annotated Image Result", visible=True) output_video = gr.Video(label="Annotated Video Result", visible=False) raw_output_box = gr.HTML(label="Visual Trace Dashboard") input_type.change( fn=lambda c: (gr.update(visible=(c == "Image")), gr.update(visible=(c == "Video"))), inputs=input_type, outputs=[image_input, video_input], ) for comp in [task_dropdown, category_input]: comp.change(fn=generate_raw_prompt, inputs=[task_dropdown, category_input], outputs=raw_prompt_box) run_btn.click( fn=lambda: gr.update(interactive=False, value="Processing Tensors..."), outputs=[run_btn], ).then( fn=run_inference, inputs=[ input_type, image_input, video_input, task_dropdown, category_input, model_dropdown, temp_slider, top_p_slider, top_k_slider, short_size_input, raw_prompt_box, max_video_frames_slider, ], outputs=[output_image, output_video, raw_output_box], ).then( fn=lambda: gr.update(interactive=True, value="Run Inference"), outputs=[run_btn], ) return demo if __name__ == "__main__": demo = build_ui() demo.queue().launch()