Syzygianinfern0 committed

Commit d41a400 · Parent: 2d6187c

Bring over latest scripts from demo (7c8fc86)
execute_demo_v2.py CHANGED
@@ -1,588 +1,588 @@
import json
import os
import uuid
import cv2
import subprocess
import numpy as np
import gradio as gr
import tempfile
from typing import Dict, List, Iterable, Tuple

from ns_vfs.video.read_mp4 import Mp4Reader
from execute_with_mp4 import process_entry
from matplotlib import pyplot as plt

import base64

from openai import OpenAI

class VLLMClient:
    def __init__(
        self,
        api_key="EMPTY",
        api_base="http://localhost:8000/v1",
        model="OpenGVLab/InternVL2-8B",
        # model="Qwen/Qwen2.5-VL-7B-Instruct",
    ):
        self.client = OpenAI(api_key=api_key, base_url=api_base)
        self.model = model

    # def _encode_frame(self, frame):
    #     return base64.b64encode(frame.tobytes()).decode("utf-8")
    def _encode_frame(self, frame):
        # Encode a uint8 numpy array (image) as a JPEG and then base64 encode it.
        ret, buffer = cv2.imencode(".jpg", frame)
        if not ret:
            raise ValueError("Could not encode frame")
        return base64.b64encode(buffer).decode("utf-8")

    def caption( self, frames: list[np.ndarray]):

        parsing_rule = " You must return a caption for the sequence of images. The caption must be a single sentence. The caption must be in the same language as the question."
        prompt = rf"Give me a detailed description of what you see in the images " f"\n[PARSING RULE]: {parsing_rule}"

        # Encode each frame.
        encoded_images = [self._encode_frame(frame) for frame in frames]

        # Build the user message: a text prompt plus one image for each frame.
        user_content = [
            {
                "type": "text",
                "text": f"The following is the sequence of images",
            }
        ]
        for encoded in encoded_images:
            user_content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
                }
            )

        # Create a chat completion request.
        chat_response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": user_content},
            ],
            max_tokens=1000,
            temperature=0.0,
            logprobs=True,
        )
        content = chat_response.choices[0].message.content
        return content


def _load_entry_from_reader(video_path, query_text):
    reader = Mp4Reader(
        [{"path": video_path, "query": query_text}],
        openai_save_path="",
        sampling_rate_fps=0.5
    )
    data = reader.read_video()
    if not data:
        raise RuntimeError("No data returned by Mp4Reader (check video path)")
    return data[0]


def _make_empty_video(path, width=320, height=240, fps=1.0):
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(path, fourcc, fps, (width, height))
    frame = np.zeros((height, width, 3), dtype=np.uint8)
    writer.write(frame)
    writer.release()
    return path


def _crop_video_ffmpeg(input_path, output_path, frame_indices, prop_matrix):
    if len(frame_indices) == 0:
        cap = cv2.VideoCapture(str(input_path))
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {input_path}")
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        _make_empty_video(output_path, width, height, fps=1.0)
        return

    def group_into_ranges(frames):
        if not frames:
            return []
        frames = sorted(set(frames))
        ranges = []
        start = prev = frames[0]
        for f in frames[1:]:
            if f == prev + 1:
                prev = f
            else:
                ranges.append((start, prev + 1))  # end-exclusive
                start = prev = f
        ranges.append((start, prev + 1))
        return ranges

    ranges = group_into_ranges(frame_indices)
    filters = []
    labels = []
    for i, (start, end) in enumerate(ranges):
        filters.append(
            f"[0:v]trim=start_frame={start}:end_frame={end},setpts=PTS-STARTPTS[v{i}]"
        )
        labels.append(f"[v{i}]")
    filters.append(f"{''.join(labels)}concat=n={len(ranges)}:v=1:a=0[outv]")

    cmd = [
        "ffmpeg", "-y", "-i", input_path,
        "-filter_complex", "; ".join(filters),
        "-map", "[outv]",
        "-c:v", "libx264", "-preset", "fast", "-crf", "23",
        output_path,
    ]
    subprocess.run(cmd, check=True)


def _crop_video(input_path: str, output_path: str, frame_indices: List[int], prop_matrix: Dict[str, List[int]]):
    input_path = str(input_path)
    output_path = str(output_path)

    # Probe width/height/fps
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video: {input_path}")
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = float(cap.get(cv2.CAP_PROP_FPS)) or 0.0
    cap.release()
    if fps <= 0:
        fps = 30.0

    # If nothing to write, emit a 1-frame empty video
    if not frame_indices:
        from numpy import zeros, uint8
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, 1.0, (width, height))
        out.write(zeros((height, width, 3), dtype=uint8))
        out.release()
        return

    # Helper: group consecutive integers into (start, end_exclusive)
    def _group_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
        f = sorted(set(int(x) for x in frames))
        if not f:
            return []
        out = []
        s = p = f[0]
        for x in f[1:]:
            if x == p + 1:
                p = x
            else:
                out.append((s, p + 1))
                s = p = x
        out.append((s, p + 1))
        return out

    # Invert prop_matrix to {frame_idx: sorted [props]}
    props_by_frame: Dict[int, List[str]] = {}
    for prop, frames in (prop_matrix or {}).items():
        for fi in frames:
            fi = int(fi)
            props_by_frame.setdefault(fi, []).append(prop)
    for fi in list(props_by_frame.keys()):
        props_by_frame[fi] = sorted(set(props_by_frame[fi]))

    # Only subtitle frames we will output
    fi_set = set(int(x) for x in frame_indices)
    frames_with_labels = sorted(fi for fi in fi_set if props_by_frame.get(fi))

    # Compress consecutive frames that share the same label set
    grouped_label_spans: List[Tuple[int, int, Tuple[str, ...]]] = []
    prev_f = None
    prev_labels: Tuple[str, ...] = ()
    span_start = None
    for f in frames_with_labels:
        labels = tuple(props_by_frame.get(f, []))
        if prev_f is None:
            span_start, prev_f, prev_labels = f, f, labels
        elif (f == prev_f + 1) and (labels == prev_labels):
            prev_f = f
        else:
            grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
            span_start, prev_f, prev_labels = f, f, labels
    if prev_f is not None and prev_labels:
        grouped_label_spans.append((span_start, prev_f + 1, prev_labels))

    # Build ASS subtitle file (top-right)
    def ass_time(t_sec: float) -> str:
        cs = int(round(t_sec * 100))
        h = cs // (100 * 3600)
        m = (cs // (100 * 60)) % 60
        s = (cs // 100) % 60
        cs = cs % 100
        return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

    def make_ass(width: int, height: int) -> str:
        lines = []
        lines.append("[Script Info]")
        lines.append("ScriptType: v4.00+")
        lines.append("ScaledBorderAndShadow: yes")
        lines.append(f"PlayResX: {width}")
        lines.append(f"PlayResY: {height}")
        lines.append("")
        lines.append("[V4+ Styles]")
        lines.append("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
                     "Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, "
                     "Shadow, Alignment, MarginL, MarginR, MarginV, Encoding")
        # Font size 18 per your request; Alignment=9 (top-right)
        lines.append("Style: Default,DejaVu Sans,18,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,"
                     "0,0,0,0,100,100,0,0,1,2,0.8,9,16,16,16,1")
        lines.append("")
        lines.append("[Events]")
        lines.append("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text")

        for start_f, end_f, labels in grouped_label_spans:
            if not labels:
                continue
            start_t = ass_time(start_f / fps)
            end_t = ass_time(end_f / fps)
            text = r"\N".join(labels)  # stacked lines
            lines.append(f"Dialogue: 0,{start_t},{end_t},Default,,0,0,0,,{text}")

        return "\n".join(lines)

    tmp_dir = tempfile.mkdtemp(prefix="props_ass_")
    ass_path = os.path.join(tmp_dir, "props.ass")
    with open(ass_path, "w", encoding="utf-8") as f:
        f.write(make_ass(width, height))

    # Build trim/concat ranges from requested frame_indices
    ranges = _group_ranges(frame_indices)

    # Filtergraph with burned subtitles then trim/concat
    split_labels = [f"[s{i}]" for i in range(len(ranges))] if ranges else []
    out_labels = [f"[v{i}]" for i in range(len(ranges))] if ranges else []

    filters = []
    ass_arg = ass_path.replace("\\", "\\\\")
    filters.append(f"[0:v]subtitles='{ass_arg}'[sub]")

    if len(ranges) == 1:
        s0, e0 = ranges[0]
        filters.append(f"[sub]trim=start_frame={s0}:end_frame={e0},setpts=PTS-STARTPTS[v0]")
    else:
        if ranges:
            filters.append(f"[sub]split={len(ranges)}{''.join(split_labels)}")
        for i, (s, e) in enumerate(ranges):
            filters.append(f"{split_labels[i]}trim=start_frame={s}:end_frame={e},setpts=PTS-STARTPTS{out_labels[i]}")

    if ranges:
        filters.append(f"{''.join(out_labels)}concat=n={len(ranges)}:v=1:a=0[outv]")

    filter_complex = "; ".join(filters)

    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-filter_complex", filter_complex,
        "-map", "[outv]" if ranges else "[sub]",
        "-c:v", "libx264", "-preset", "fast", "-crf", "23",
        output_path,
    ]
    try:
        subprocess.run(cmd, check=True)
    finally:
        try:
            os.remove(ass_path)
            os.rmdir(tmp_dir)
        except OSError:
            pass

def _format_prop_ranges_dict(prop_matrix: Dict[str, List[int]]) -> str:
    def group_into_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
        f = sorted(set(int(x) for x in frames))
        if not f:
            return []
        ranges: List[Tuple[int, int]] = []
        s = p = f[0]
        for x in f[1:]:
            if x == p + 1:
                p = x
            else:
                ranges.append((s, p))  # inclusive end for display
                s = p = x
        ranges.append((s, p))
        return ranges

    detections = {}
    for prop, frames in prop_matrix.items():
        ranges = group_into_ranges(frames)
        detections[prop] = ranges
    return detections


def _format_prop_ranges(prop_matrix: Dict[str, List[int]]) -> str:
    def group_into_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
        f = sorted(set(int(x) for x in frames))
        if not f:
            return []
        ranges: List[Tuple[int, int]] = []
        s = p = f[0]
        for x in f[1:]:
            if x == p + 1:
                p = x
            else:
                ranges.append((s, p))  # inclusive end for display
                s = p = x
        ranges.append((s, p))
        return ranges

    if not prop_matrix:
        return "No propositions detected."

    lines = []
    for prop, frames in prop_matrix.items():
        ranges = group_into_ranges(frames)
        pretty = prop.replace("_", " ").title()
        if not ranges:
            lines.append(f"{pretty}: —")
            continue
        parts = [f"{a}" if a == b else f"{a}-{b}" for (a, b) in ranges]
        lines.append(f"{pretty}: {', '.join(parts)}")
    return "\n".join(lines)

def generate_timeline_plot(detections, total_frames):
    """
    Generates a timeline plot from detection data using Matplotlib.

    Args:
        detections (dict): A dictionary where keys are string labels and values are lists
                           of (start_frame, end_frame) tuples.
                           e.g., {"dog": [(0, 45), (90, 100)], "grass": [(30, 80)]}
        total_frames (int): The total number of frames in the video for the x-axis scale.

    Returns:
        matplotlib.figure.Figure: The generated plot figure.
    """
    labels = list(detections.keys())
    num_labels = len(labels)

    # Handle case with no detections
    if num_labels == 0:
        fig, ax = plt.subplots(figsize=(10, 1))
        ax.text(0.5, 0.5, 'No propositions detected.', ha='center', va='center')
        ax.set_axis_off()
        return fig

    # Use a color map to assign distinct colors automatically
    colors = plt.cm.get_cmap('tab10', num_labels)

    fig, ax = plt.subplots(figsize=(10, num_labels * 0.6 + 0.5))

    ax.set_xlim(0, total_frames)
    ax.set_ylim(0, num_labels)
    ax.set_yticks(np.arange(num_labels) + 0.5)
    ax.set_yticklabels(labels, fontsize=12)
    ax.set_xlabel("Frame Number", fontsize=12)
    ax.grid(axis='x', linestyle='--', alpha=0.6)

    # Invert y-axis to have the first proposition on top
    ax.invert_yaxis()

    for i, label in enumerate(labels):
        # matplotlib's broken_barh needs a list of (start, width) tuples
        segments = [(start, end - start) for start, end in detections[label]]
        # The bar is drawn at y-position 'i' with a height of 0.8
        ax.broken_barh(segments, (i + 0.1, 0.8), facecolors=colors(i))

    plt.tight_layout()
    return fig

# -----------------------------
# Gradio handler
# -----------------------------
def run_pipeline(input_video, mode, query_text, propositions_json, specification_text):
    """
    Returns: (cropped_video_path, prop_ranges_text, tl_text)
    """

    def _err(msg, width=320, height=240):  # keep outputs shape consistent
        tmp_out = os.path.join("/tmp", f"empty_{uuid.uuid4().hex}.mp4")
        _make_empty_video(tmp_out, width=width, height=height, fps=1.0)
        return (
            tmp_out,
            "No propositions detected.",
            f"Error: {msg}"
        )

    # Resolve video path
    if isinstance(input_video, dict) and "name" in input_video:
        video_path = input_video["name"]
    elif isinstance(input_video, str):
        video_path = input_video
    else:
        return _err("Please provide a video.")

    # Build entry
    if mode == "Natural language query":
        if not query_text or not query_text.strip():
            return _err("Please enter a query.")
        entry = _load_entry_from_reader(video_path, query_text)
    else:
        if not (propositions_json and propositions_json.strip()) or not (specification_text and specification_text.strip()):
            return _err("Please provide both Propositions (array) and Specification.")
        entry = _load_entry_from_reader(video_path, "dummy-query")
        try:
            props = json.loads(propositions_json)
            if not isinstance(props, list):
                return _err("Propositions must be a JSON array.")
        except Exception as e:
            return _err(f"Failed to parse propositions JSON: {e}")
        entry["tl"] = {
            "propositions": props,
            "specification": specification_text
        }

    # Compute FOI
    try:
        foi, prop_matrix, p2 = process_entry(entry)  # list of frame indices & {prop: [frames]}
        print(foi)
        print(prop_matrix)
        print(p2)
    except Exception as e:
        return _err(f"Processing error: {e}")

    # Write cropped video
    try:
        out_path = os.path.join("/tmp", f"cropped_{uuid.uuid4().hex}.mp4")
        _crop_video(video_path, out_path, foi, prop_matrix)
        print(f"Wrote cropped video to: {out_path}")
    except Exception as e:
        return _err(f"Failed to write cropped video: {e}")

    # Build right-side text sections
    prop_ranges_text = _format_prop_ranges(prop_matrix)
    prop_ranges_dict = _format_prop_ranges_dict(prop_matrix)
    plot = generate_timeline_plot(prop_ranges_dict, entry["video_info"].frame_count)
    tl_text = (
        f"Propositions: {json.dumps(entry['tl']['propositions'], ensure_ascii=False)}\n"
        f"Specification: {entry['tl']['specification']}"
    )
    return out_path, prop_ranges_text, tl_text, plot

def generate_caption(video_path):
    """
    Simulates generating a caption for the given video file.
    """
    # If the video is cleared, the input will be None
    if video_path is None:
        # Hide the caption box and clear its content
        return gr.update(value="", visible=False)
    print(f"Generating caption for: {video_path}")
    vllm_client = VLLMClient()
    entry = _load_entry_from_reader(video_path, "dummy-query")
    # sample 4 frames from the video evenly
    len_frames = len(entry['images'])
    images = [entry['images'][i] for i in range(0, len_frames, len_frames//3)]
    caption_text = vllm_client.caption(images)
    # Simulate model inference time
    # Use gr.update to change both the value and visibility of the textbox
    return gr.update(value=caption_text, visible=True)
# -----------------------------
# UI
# -----------------------------
with gr.Blocks(css="""
#io-col {display: flex; gap: 1rem;}
#left {flex: 1;}
#right {flex: 1;}
""", title="NSVS-TL") as demo:

    gr.Markdown("# Neuro-Symbolic Visual Search with Temporal Logic")
    gr.Markdown(
        "Upload a video and either provide a natural-language **Query** *or* directly supply **Propositions** (array) + **Specification**. "
        "On the right, you'll get a **cropped video** containing only the frames of interest, a **Propositions by Frames** summary, and the combined TL summary."
    )

    with gr.Row(elem_id="io-col"):
        with gr.Column(elem_id="left"):
            mode = gr.Radio(
                choices=["Natural language query", "Props/Spec"],
                value="Natural language query",
                label="Input mode"
            )
            video = gr.Video(label="Upload Video")

            query = gr.Textbox(
                label="Query (natural language)",
                placeholder="e.g., a man is jumping and panting until he falls down"
            )

            captions = gr.Textbox(
                label="Video Caption",
                placeholder="e.g., a man is jumping and panting until he falls down",
                lines=4,
                visible=False
            )

            propositions = gr.Textbox(
                label="Propositions (JSON array)",
                placeholder='e.g., ["man_jumps", "man_pants", "man_falls_down"]',
                lines=4,
                visible=False
            )
            specification = gr.Textbox(
                label="Specification",
                placeholder='e.g., ("woman_jumps" & "woman_claps") U "candle_is_blown"',
                visible=False
            )

            def _toggle_fields(m):
                if m == "Natural language query":
                    return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
                else:
                    return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)

            mode.change(_toggle_fields, inputs=[mode], outputs=[query, propositions, specification])
            video.change(
                fn=generate_caption,
                inputs=[video],
                outputs=[captions]
            )
            run_btn = gr.Button("Run", variant="primary")

            gr.Examples(
                label="Examples (dummy paths + queries)",
                examples=[
                    ["demo_videos/dog_jump.mp4", "a dog jumps until a red tube is in view"],
                    ["demo_videos/blue_shirt.mp4", "a girl in a green shirt until a candle is blown"],
-                    ["demo_videos/car.mp4", "red car until a truck"],
-                    ["demo_videos/newyork_1.mp4", "bright lights until empire state building"],
-                    ["demo_videos/chicago_2.mp4", "ocean until ship"],
+                    ["demo_videos/car.mp4", "red car until a truck"]
                ],
                inputs=[video, query],
-                cache_examples=False,
+                cache_examples=False
            )

        with gr.Column(elem_id="right"):
            cropped_video = gr.Video(label="Cropped Video (Frames of Interest Only)")

            prop_ranges_out = gr.Textbox(
                label="Propositions by Frames",
                lines=6,
                interactive=False
            )

            timeline_plot_output = gr.Plot(label="Propositions Timeline")

            tl_out = gr.Textbox(
                label="TL (Propositions & Specification)",
                lines=8,
                interactive=False
            )

    run_btn.click(
        fn=run_pipeline,
        inputs=[video, mode, query, propositions, specification],
        outputs=[cropped_video, prop_ranges_out, tl_out, timeline_plot_output]
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
+
+
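Both versions of the script crop the output video by grouping the frames of interest into contiguous runs and cutting each run out with ffmpeg's split/trim/concat filters. The following is a minimal standalone sketch of that core logic, not part of the commit; the names `group_ranges` and `build_filter_complex` are illustrative, mirroring `_group_ranges` and the filtergraph assembly inside `_crop_video` above:

# Illustrative sketch (assumes 0-based frame indices and end-exclusive runs, as in the script).
from typing import Iterable, List, Tuple

def group_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
    # Collapse sorted, de-duplicated indices into (start, end_exclusive) runs.
    f = sorted(set(int(x) for x in frames))
    if not f:
        return []
    runs, s, p = [], f[0], f[0]
    for x in f[1:]:
        if x == p + 1:
            p = x
        else:
            runs.append((s, p + 1))
            s = p = x
    runs.append((s, p + 1))
    return runs

def build_filter_complex(frames: Iterable[int]) -> str:
    # One split output per run, one trim per run, then concat back into [outv].
    # The empty-frames case is handled upstream in the script (a 1-frame empty video).
    ranges = group_ranges(frames)
    n = len(ranges)
    filters = [f"[0:v]split={n}" + "".join(f"[s{i}]" for i in range(n))]
    for i, (s, e) in enumerate(ranges):
        filters.append(f"[s{i}]trim=start_frame={s}:end_frame={e},setpts=PTS-STARTPTS[v{i}]")
    filters.append("".join(f"[v{i}]" for i in range(n)) + f"concat=n={n}:v=1:a=0[outv]")
    return "; ".join(filters)

print(group_ranges([3, 4, 5, 9]))         # [(3, 6), (9, 10)]
print(build_filter_complex([3, 4, 5, 9]))

The `setpts=PTS-STARTPTS` reset after each trim is what keeps the concatenated segments from inheriting their original timestamps.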
execute_demo_v3.py ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import json
4
+ import uuid
5
+ import base64
6
+ import tempfile
7
+ import subprocess
8
+ import numpy as np
9
+ import gradio as gr
10
+
11
+ from openai import OpenAI
12
+ from matplotlib import pyplot as plt
13
+ from typing import Dict, List, Iterable, Tuple, Union
14
+
15
+ from ns_vfs.video.read_mp4 import Mp4Reader
16
+ from execute_with_mp4 import process_entry
17
+
18
+ # Optional import of preprocess_yolo if available alongside process_entry
19
+ try:
20
+ from execute_with_mp4 import preprocess_yolo
21
+ except Exception:
22
+ preprocess_yolo = None
23
+
24
+
25
+ class VLLMClient:
26
+ def __init__(
27
+ self,
28
+ api_key="EMPTY",
29
+ api_base="http://localhost:8000/v1",
30
+ model="OpenGVLab/InternVL2-8B",
31
+ ):
32
+ self.client = OpenAI(api_key=api_key, base_url=api_base)
33
+ self.model = model
34
+
35
+ def _encode_frame(self, frame):
36
+ ok, buffer = cv2.imencode(".jpg", frame)
37
+ if not ok:
38
+ raise ValueError("Could not encode frame")
39
+ return base64.b64encode(buffer).decode("utf-8")
40
+
41
+ def caption(self, frames: list[np.ndarray]):
42
+ parsing_rule = (
43
+ " You must return a caption for the sequence of images. "
44
+ "The caption must be a single sentence. "
45
+ "The caption must be in the same language as the question."
46
+ )
47
+ prompt = (
48
+ r"Give me a detailed description of what you see in the images "
49
+ f"\n[PARSING RULE]: {parsing_rule}"
50
+ )
51
+ encoded_images = [self._encode_frame(frame) for frame in frames]
52
+ user_content = [{"type": "text", "text": "The following is the sequence of images"}]
53
+ for encoded in encoded_images:
54
+ user_content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}})
55
+
56
+ chat_response = self.client.chat.completions.create(
57
+ model=self.model,
58
+ messages=[
59
+ {"role": "system", "content": prompt},
60
+ {"role": "user", "content": user_content},
61
+ ],
62
+ max_tokens=1000,
63
+ temperature=0.0,
64
+ logprobs=True,
65
+ )
66
+ return chat_response.choices[0].message.content
67
+
68
+
69
+ def _load_entry_from_reader(video_path, query_text):
70
+ reader = Mp4Reader(
71
+ [{"path": video_path, "query": query_text}],
72
+ openai_save_path="",
73
+ sampling_rate_fps=2
74
+ )
75
+ data = reader.read_video()
76
+ if not data:
77
+ raise RuntimeError("No data returned by Mp4Reader (check video path)")
78
+ return data[0]
79
+
80
+
81
+ def _make_empty_video(path, width=320, height=240, fps=1.0):
82
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
83
+ writer = cv2.VideoWriter(path, fourcc, fps, (width, height))
84
+ frame = np.zeros((height, width, 3), dtype=np.uint8)
85
+ writer.write(frame)
86
+ writer.release()
87
+ return path
88
+
89
+
90
+ # -----------------------------
91
+ # Helpers to detect bbox-style outputs and to convert them
92
+ # -----------------------------
93
+ BBox = Tuple[float, float, float, float]
94
+ YOLODict = Dict[str, List[Tuple[int, BBox]]]
95
+ VLMDict = Dict[str, List[int]]
96
+
97
+ def _has_bboxes(prop_matrix: Union[YOLODict, VLMDict]) -> bool:
98
+ """Return True if the prop_matrix contains (frame_idx, bbox) tuples."""
99
+ if not prop_matrix:
100
+ return False
101
+ for v in prop_matrix.values():
102
+ if not v:
103
+ continue
104
+ first = v[0]
105
+ if isinstance(first, tuple) and len(first) == 2 and hasattr(first[1], "__len__") and len(first[1]) == 4:
106
+ return True
107
+ return False
108
+
109
+ def _bbox_dict_to_frames_only(prop_bboxes: YOLODict) -> VLMDict:
110
+ """Convert {'car': [(i, (x1,y1,x2,y2)), ...], ...} -> {'car': [i, ...], ...}"""
111
+ out: VLMDict = {}
112
+ for k, pairs in (prop_bboxes or {}).items():
113
+ out[k] = [int(i) for i, _ in pairs]
114
+ return out
115
+
116
+
117
+ # -----------------------------
118
+ # Video cropping and overlays
119
+ # -----------------------------
120
+ def _crop_video_subtitles(input_path: str, output_path: str, frame_indices: List[int], prop_matrix: VLMDict):
121
+ """
122
+ Existing behavior (VLM/no bboxes):
123
+ - Keep only frames in frame_indices (in order, contiguous groups)
124
+ - Overlay top-right proposition text via ASS subtitles
125
+ """
126
+ input_path = str(input_path)
127
+ output_path = str(output_path)
128
+
129
+ cap = cv2.VideoCapture(input_path)
130
+ if not cap.isOpened():
131
+ raise RuntimeError(f"Could not open video: {input_path}")
132
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
133
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
134
+ fps = float(cap.get(cv2.CAP_PROP_FPS)) or 0.0
135
+ cap.release()
136
+ if fps <= 0:
137
+ fps = 30.0
138
+
139
+ if not frame_indices:
140
+ from numpy import zeros, uint8
141
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
142
+ out = cv2.VideoWriter(output_path, fourcc, 1.0, (width, height))
143
+ out.write(zeros((height, width, 3), dtype=uint8))
144
+ out.release()
145
+ return
146
+
147
+ def _group_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
148
+ f = sorted(set(int(x) for x in frames))
149
+ if not f:
150
+ return []
151
+ out = []
152
+ s = p = f[0]
153
+ for x in f[1:]:
154
+ if x == p + 1:
155
+ p = x
156
+ else:
157
+ out.append((s, p + 1))
158
+ s = p = x
159
+ out.append((s, p + 1))
160
+ return out
161
+
162
+ props_by_frame: Dict[int, List[str]] = {}
163
+ for prop, frames in (prop_matrix or {}).items():
164
+ for fi in frames:
165
+ fi = int(fi)
166
+ props_by_frame.setdefault(fi, []).append(prop)
167
+ for fi in list(props_by_frame.keys()):
168
+ props_by_frame[fi] = sorted(set(props_by_frame[fi]))
169
+
170
+ fi_set = set(int(x) for x in frame_indices)
171
+ frames_with_labels = sorted(fi for fi in fi_set if props_by_frame.get(fi))
172
+
173
+ grouped_label_spans: List[Tuple[int, int, Tuple[str, ...]]] = []
174
+ prev_f = None
175
+ prev_labels: Tuple[str, ...] = ()
176
+ span_start = None
177
+ for f in frames_with_labels:
178
+ labels = tuple(props_by_frame.get(f, []))
179
+ if prev_f is None:
180
+ span_start, prev_f, prev_labels = f, f, labels
181
+ elif (f == prev_f + 1) and (labels == prev_labels):
182
+ prev_f = f
183
+ else:
184
+ grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
185
+ span_start, prev_f, prev_labels = f, f, labels
186
+ if prev_f is not None and prev_labels:
187
+ grouped_label_spans.append((span_start, prev_f + 1, prev_labels))
188
+
189
+ # Build ASS subtitle (top-right)
190
+ def ass_time(t_sec: float) -> str:
191
+ cs = int(round(t_sec * 100))
192
+ h = cs // (100 * 3600)
193
+ m = (cs // (100 * 60)) % 60
194
+ s = (cs // 100) % 60
195
+ cs = cs % 100
196
+ return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
197
+
198
+ def make_ass(width: int, height: int) -> str:
199
+ lines = []
200
+ lines.append("[Script Info]")
201
+ lines.append("ScriptType: v4.00+")
202
+ lines.append("ScaledBorderAndShadow: yes")
203
+ lines.append(f"PlayResX: {width}")
204
+ lines.append(f"PlayResY: {height}")
205
+ lines.append("")
206
+ lines.append("[V4+ Styles]")
207
+ lines.append("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
208
+ "Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, "
209
+ "Shadow, Alignment, MarginL, MarginR, MarginV, Encoding")
210
+ lines.append("Style: Default,DejaVu Sans,18,&H00FFFFFF,&H000000FF,&H00000000,&H64000000,"
211
+ "0,0,0,0,100,100,0,0,1,2,0.8,9,16,16,16,1")
212
+ lines.append("")
213
+ lines.append("[Events]")
214
+ lines.append("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text")
215
+
216
+ for start_f, end_f, labels in grouped_label_spans:
217
+ if not labels:
218
+ continue
219
+ start_t = ass_time(start_f / fps)
220
+ end_t = ass_time(end_f / fps)
221
+ text = r"\N".join(labels) # stacked lines
222
+ lines.append(f"Dialogue: 0,{start_t},{end_t},Default,,0,0,0,,{text}")
223
+
224
+ return "\n".join(lines)
225
+
226
+ tmp_dir = tempfile.mkdtemp(prefix="props_ass_")
227
+ ass_path = os.path.join(tmp_dir, "props.ass")
228
+ with open(ass_path, "w", encoding="utf-8") as f:
229
+ f.write(make_ass(width, height))
230
+
231
+ ranges = _group_ranges(frame_indices)
232
+
233
+ split_labels = [f"[s{i}]" for i in range(len(ranges))] if ranges else []
234
+ out_labels = [f"[v{i}]" for i in range(len(ranges))] if ranges else []
235
+
236
+ filters = []
237
+ ass_arg = ass_path.replace("\\", "\\\\")
238
+ filters.append(f"[0:v]subtitles='{ass_arg}'[sub]")
239
+
240
+ if len(ranges) == 1:
241
+ s0, e0 = ranges[0]
242
+ filters.append(f"[sub]trim=start_frame={s0}:end_frame={e0},setpts=PTS-STARTPTS[v0]")
243
+ else:
244
+ if ranges:
245
+ filters.append(f"[sub]split={len(ranges)}{''.join(split_labels)}")
246
+ for i, (s, e) in enumerate(ranges):
247
+ filters.append(f"{split_labels[i]}trim=start_frame={s}:end_frame={e},setpts=PTS-STARTPTS{out_labels[i]}")
248
+
249
+ if ranges:
250
+ filters.append(f"{''.join(out_labels)}concat=n={len(ranges)}:v=1:a=0[outv]")
251
+
252
+ filter_complex = "; ".join(filters)
253
+
254
+ cmd = [
255
+ "ffmpeg", "-y",
256
+ "-i", input_path,
257
+ "-filter_complex", filter_complex,
258
+ "-map", "[outv]" if ranges else "[sub]",
259
+ "-c:v", "libx264", "-preset", "fast", "-crf", "23",
260
+ output_path,
261
+ ]
262
+ try:
263
+ subprocess.run(cmd, check=True)
264
+ finally:
265
+ try:
266
+ os.remove(ass_path)
267
+ os.rmdir(tmp_dir)
268
+ except OSError:
269
+ pass
270
+
271
+
272
+ def _crop_video_bboxes(input_path: str, output_path: str, frame_indices: List[int], prop_bboxes: YOLODict):
273
+ """
274
+ YOLO path (with bounding boxes):
275
+ - Keep only frames in frame_indices.
276
+ - Draw rectangles for each detected prop on the kept frames.
277
+ - Label each rectangle with the prop name (top-left of box).
278
+ """
279
+ keep_set = set(int(x) for x in frame_indices)
280
+ if not keep_set:
281
+ # output a 1-frame empty video (consistent with _crop_video_subtitles)
282
+ cap0 = cv2.VideoCapture(input_path)
283
+ if not cap0.isOpened():
284
+ raise RuntimeError(f"Could not open video: {input_path}")
285
+ width = int(cap0.get(cv2.CAP_PROP_FRAME_WIDTH))
286
+ height = int(cap0.get(cv2.CAP_PROP_FRAME_HEIGHT))
287
+ cap0.release()
288
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
289
+ out = cv2.VideoWriter(output_path, fourcc, 1.0, (width, height))
290
+ out.write(np.zeros((height, width, 3), dtype=np.uint8))
291
+ out.release()
292
+ return
293
+
294
+ # Build frame -> list[(prop, bbox)]
295
+ per_frame: Dict[int, List[Tuple[str, BBox]]] = {}
296
+ for prop, pairs in (prop_bboxes or {}).items():
297
+ for fi, bbox in pairs:
298
+ fi = int(fi)
299
+ per_frame.setdefault(fi, []).append((prop, bbox))
300
+
301
+ cap = cv2.VideoCapture(input_path)
302
+ if not cap.isOpened():
303
+ raise RuntimeError(f"Could not open video: {input_path}")
304
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
305
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
306
+ fps = float(cap.get(cv2.CAP_PROP_FPS)) or 30.0
307
+
308
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
309
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
310
+
311
+ idx = 0
312
+ ok, frame = cap.read()
313
+ while ok:
314
+ if idx in keep_set:
315
+ # draw all bboxes for this frame
316
+ for prop, (x1, y1, x2, y2) in per_frame.get(idx, []):
317
+ p1 = (int(round(x1)), int(round(y1)))
318
+ p2 = (int(round(x2)), int(round(y2)))
319
+ cv2.rectangle(frame, p1, p2, (0, 255, 0), 2) # green rectangle
320
+ # text background for readability
321
+ label = prop.replace("_", " ")
322
+ txt_origin = (p1[0], max(0, p1[1] - 5))
323
+ cv2.putText(frame, label, txt_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 3, cv2.LINE_AA)
324
+ cv2.putText(frame, label, txt_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)
325
+ out.write(frame)
326
+ idx += 1
327
+ ok, frame = cap.read()
328
+
329
+ cap.release()
330
+ out.release()
331
+
332
+
333
+ def _crop_video(
334
+ input_path: str,
335
+ output_path: str,
336
+ frame_indices: List[int],
337
+ prop_matrix: Union[VLMDict, YOLODict]
338
+ ):
339
+ """
340
+ Dispatch to the appropriate cropper:
341
+ - VLM/no-bbox: ASS subtitle overlay.
342
+ - YOLO with bbox: draw rectangles overlay via OpenCV.
343
+ """
344
+ if _has_bboxes(prop_matrix):
345
+ _crop_video_bboxes(input_path, output_path, frame_indices, prop_matrix) # type: ignore[arg-type]
346
+ else:
347
+ _crop_video_subtitles(input_path, output_path, frame_indices, prop_matrix) # type: ignore[arg-type]
348
+
349
+
350
+ # -----------------------------
351
+ # Text helpers (unchanged API, but robust to bbox dicts)
352
+ # -----------------------------
353
+ def _format_prop_ranges_dict(prop_matrix: Union[VLMDict, YOLODict]) -> Dict[str, List[Tuple[int, int]]]:
354
+ def group_into_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
355
+ f = sorted(set(int(x) for x in frames))
356
+ if not f:
357
+ return []
358
+ ranges: List[Tuple[int, int]] = []
359
+ s = p = f[0]
360
+ for x in f[1:]:
361
+ if x == p + 1:
362
+ p = x
363
+ else:
364
+ ranges.append((s, p))
365
+ s = p = x
366
+ ranges.append((s, p))
367
+ return ranges
368
+
369
+ if _has_bboxes(prop_matrix):
370
+ frames_only = _bbox_dict_to_frames_only(prop_matrix) # type: ignore[arg-type]
371
+ else:
372
+ frames_only = prop_matrix # type: ignore[assignment]
373
+
374
+ detections: Dict[str, List[Tuple[int, int]]] = {}
375
+ for prop, frames in (frames_only or {}).items():
376
+ detections[prop] = group_into_ranges(frames)
377
+ return detections
378
+
379
+
380
+def _format_prop_ranges(prop_matrix: Union[VLMDict, YOLODict]) -> str:
+    def group_into_ranges(frames: Iterable[int]) -> List[Tuple[int, int]]:
+        f = sorted(set(int(x) for x in frames))
+        if not f:
+            return []
+        ranges: List[Tuple[int, int]] = []
+        s = p = f[0]
+        for x in f[1:]:
+            if x == p + 1:
+                p = x
+            else:
+                ranges.append((s, p))
+                s = p = x
+        ranges.append((s, p))
+        return ranges
+
+    if not prop_matrix:
+        return "No propositions detected."
+
+    if _has_bboxes(prop_matrix):
+        frames_only = _bbox_dict_to_frames_only(prop_matrix)  # type: ignore[arg-type]
+    else:
+        frames_only = prop_matrix  # type: ignore[assignment]
+
+    lines = []
+    for prop, frames in (frames_only or {}).items():
+        ranges = group_into_ranges(frames)
+        pretty = prop.replace("_", " ").title()
+        if not ranges:
+            lines.append(f"{pretty}: —")
+            continue
+        parts = [f"{a}" if a == b else f"{a}-{b}" for (a, b) in ranges]
+        lines.append(f"{pretty}: {', '.join(parts)}")
+    return "\n".join(lines)
+
+
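# For intuition: group_into_ranges collapses consecutive frame indices into
# inclusive (start, end) runs, e.g.
#   group_into_ranges([3, 4, 5, 9, 11, 12]) -> [(3, 5), (9, 9), (11, 12)]
# which _format_prop_ranges then renders as "3-5, 9, 11-12".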
+# -----------------------------
+# Plotting
+# -----------------------------
+def generate_timeline_plot(detections, total_frames):
+    labels = list(detections.keys())
+    num_labels = len(labels)
+
+    if num_labels == 0:
+        fig, ax = plt.subplots(figsize=(10, 1))
+        ax.text(0.5, 0.5, 'No propositions detected.', ha='center', va='center')
+        ax.set_axis_off()
+        return fig
+
+    colors = plt.cm.get_cmap('tab10', num_labels)
+    fig, ax = plt.subplots(figsize=(10, num_labels * 0.6 + 0.5))
+
+    ax.set_xlim(0, total_frames)
+    ax.set_ylim(0, num_labels)
+    ax.set_yticks(np.arange(num_labels) + 0.5)
+    ax.set_yticklabels(labels, fontsize=12)
+    ax.set_xlabel("Frame Number", fontsize=12)
+    ax.grid(axis='x', linestyle='--', alpha=0.6)
+    ax.invert_yaxis()
+
+    for i, label in enumerate(labels):
+        # ranges are inclusive (start, end), so a single-frame run still gets width 1
+        segments = [(start, end - start + 1) for start, end in detections[label]]
+        ax.broken_barh(segments, (i + 0.1, 0.8), facecolors=colors(i))
+
+    plt.tight_layout()
+    return fig
+
+
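# A quick check of the plot helper (illustrative inputs, matching the dict
# shape produced by _format_prop_ranges_dict; the output path is hypothetical):
fig = generate_timeline_plot({"car": [(0, 14), (40, 55)], "truck": [(30, 60)]}, 90)
fig.savefig("/tmp/timeline_preview.png")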
+# -----------------------------
+# Helpers for YOLO cache path
+# -----------------------------
+def _yolo_cache_path_for_video(video_path: str) -> str:
+    """
+    Always save the YOLO cache in the demo_videos folder.
+        demo_videos/car.mp4 -> demo_videos/car.npz
+        uploads/tmp123.mp4  -> demo_videos/tmp123.npz
+    """
+    base = os.path.basename(video_path)
+    root, _ = os.path.splitext(base)
+    os.makedirs("demo_videos", exist_ok=True)
+    return os.path.join("demo_videos", f"{root}.npz")
+
+
+# -----------------------------
+# Gradio handler
+# -----------------------------
+def run_pipeline(input_video, mode, detector, query_text, propositions_json, specification_text):
+    def _err(msg, width=320, height=240):
+        tmp_out = os.path.join("/tmp", f"empty_{uuid.uuid4().hex}.mp4")
+        _make_empty_video(tmp_out, width=width, height=height, fps=1.0)
+        return (tmp_out, "No propositions detected.", f"Error: {msg}", None)
+
+    # Normalize input path
+    if isinstance(input_video, dict) and "name" in input_video:
+        video_path = input_video["name"]
+    elif isinstance(input_video, str):
+        video_path = input_video
+    else:
+        return _err("Please provide a video.")
+
+    # Build entry
+    if mode == "Natural language query":
+        if not query_text or not query_text.strip():
+            return _err("Please enter a query.")
+        entry = _load_entry_from_reader(video_path, query_text)
+    else:
+        if not (propositions_json and propositions_json.strip()) or not (specification_text and specification_text.strip()):
+            return _err("Please provide both Propositions (array) and Specification.")
+        entry = _load_entry_from_reader(video_path, "dummy-query")
+        try:
+            props = json.loads(propositions_json)
+            if not isinstance(props, list):
+                return _err("Propositions must be a JSON array.")
+        except Exception as e:
+            return _err(f"Failed to parse propositions JSON: {e}")
+        entry["tl"] = {"propositions": props, "specification": specification_text}
+
+    # Process depending on detector
+    foi = None
+    prop_matrix: Union[VLMDict, YOLODict] = {}
+
+    if detector == "YOLO":
+        cache_path = _yolo_cache_path_for_video(video_path)
+
+        # 1) run YOLO preprocessing when the YOLO detector is selected
+        try:
+            if preprocess_yolo is None:
+                raise NameError("preprocess_yolo() not defined")
+            ret_path = preprocess_yolo(
+                entry["images"],
+                model_weights="yolov8n.pt",
+                device="cuda:0",
+                out_path=cache_path
+            )
+            if isinstance(ret_path, str) and ret_path.strip():
+                cache_path = ret_path
+        except NameError:
+            return _err("YOLO selected but preprocess_yolo is not available.")
+        except Exception as e:
+            return _err(f"YOLO preprocessing error: {e}")
+
+        # 2) then run with YOLO
+        try:
+            res = process_entry(entry, run_with_yolo=True, cache_path=cache_path)
+            if isinstance(res, tuple) and len(res) == 2:
+                foi, prop_matrix = res
+            else:
+                foi = res
+                prop_matrix = {}
+        except Exception as e:
+            return _err(f"Processing error (YOLO mode): {e}")
+
+    else:
+        # VLM path only
+        try:
+            foi, prop_matrix = process_entry(entry, run_with_yolo=False)
+        except Exception as e:
+            return _err(f"Processing error (VLM mode): {e}")
+
+    # Export cropped video (with either subtitles or bbox overlays)
+    try:
+        out_path = os.path.join("/tmp", f"cropped_{uuid.uuid4().hex}.mp4")
+        _crop_video(video_path, out_path, foi, prop_matrix)
+    except Exception as e:
+        return _err(f"Failed to write cropped video: {e}")
+
+    # Text + plot (work from frames; ignore bbox coords)
+    try:
+        prop_ranges_text = _format_prop_ranges(prop_matrix)
+        prop_ranges_dict = _format_prop_ranges_dict(prop_matrix)
+        plot = generate_timeline_plot(prop_ranges_dict, entry["video_info"].frame_count)
+    except Exception:
+        prop_ranges_text = "No propositions detected." if not prop_matrix else str(prop_matrix)
+        plot = generate_timeline_plot({}, entry["video_info"].frame_count)
+
+    tl_text = (
+        f"Propositions: {json.dumps(entry['tl']['propositions'], ensure_ascii=False)}\n"
+        f"Specification: {entry['tl']['specification']}"
+    )
+    return out_path, prop_ranges_text, tl_text, plot
+
+
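# A minimal headless invocation sketch (not part of the app's event wiring;
# assumes demo_videos/car.mp4 exists and the vLLM/YOLO backends are reachable):
video_out, ranges_text, tl_text, plot = run_pipeline(
    "demo_videos/car.mp4", "Natural language query", "YOLO",
    "car until truck", None, None,
)
print(ranges_text)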
+def generate_caption(video_path):
+    if video_path is None:
+        return gr.update(value="", visible=False)
+    vllm_client = VLLMClient()
+    entry = _load_entry_from_reader(video_path, "dummy-query")
+    n = len(entry['images'])
+    if n == 0:
+        # guard against videos that yielded no sampled frames
+        return gr.update(value="", visible=False)
+    step = max(1, n // 3)
+    images = [entry['images'][i] for i in range(0, n, step)][:3]
+    caption_text = vllm_client.caption(images)
+    return gr.update(value=caption_text, visible=True)
+
+
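# Sampling arithmetic, for reference: with n = 10 sampled frames, step is
# max(1, 10 // 3) = 3, so range(0, 10, 3) yields indices 0, 3, 6, 9, which are
# truncated to the first three -> frames 0, 3, and 6 go to the captioner.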
+# -----------------------------
+# UI
+# -----------------------------
+with gr.Blocks(css="""
+#io-col {display: flex; gap: 1rem;}
+#left {flex: 1;}
+#right {flex: 1;}
+""", title="NSVS-TL") as demo:
+
+    gr.Markdown("# Neuro-Symbolic Visual Search with Temporal Logic")
+    gr.Markdown("Upload a video and either provide a natural-language **Query** *or* directly supply **Propositions** + **Specification**.")
+
+    with gr.Row(elem_id="io-col"):
+        with gr.Column(elem_id="left"):
+            mode = gr.Radio(
+                choices=["Natural language query", "Props/Spec"],
+                value="Natural language query",
+                label="Input mode"
+            )
+
+            detector = gr.Radio(
+                choices=["VLM", "YOLO"],
+                value="VLM",
+                label="Detector (VLM vs YOLO)"
+            )
+
+            video = gr.Video(label="Upload Video")
+
+            query = gr.Textbox(
+                label="Query (natural language)",
+                placeholder="e.g., a man is jumping and panting until he falls down"
+            )
+
+            captions = gr.Textbox(
+                label="Video Caption",
+                placeholder="Auto caption will appear here",
+                lines=4,
+                visible=False
+            )
+
+            propositions = gr.Textbox(
+                label="Propositions (JSON array)",
+                placeholder='e.g., ["man_jumps", "man_pants", "man_falls_down"]',
+                lines=4,
+                visible=False
+            )
+            specification = gr.Textbox(
+                label="Specification",
+                placeholder='e.g., ("woman_jumps" & "woman_claps") U "candle_is_blown"',
+                visible=False
+            )
+
+            def _toggle_fields(m):
+                if m == "Natural language query":
+                    return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+                else:
+                    return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
+
+            # Only toggles visibility of fields; no processing
+            mode.change(_toggle_fields, inputs=[mode], outputs=[query, propositions, specification])
+
+            # Only auto-captioning runs on video change
+            video.change(fn=generate_caption, inputs=[video], outputs=[captions], queue=False)
+
+            run_btn = gr.Button("Run", variant="primary")
+
+            gr.Examples(
+                label="Examples",
+                examples=[
+                    ["demo_videos/dog_jump.mp4", "a dog jumps until a red tube is in view"],
+                    ["demo_videos/blue_shirt.mp4", "a girl in a green shirt until a candle is blown"],
+                    ["demo_videos/car.mp4", "red car until a truck"],
+                    ["demo_videos/newyork_1.mp4", "taxi until empire state building"],
+                    ["demo_videos/chicago_2.mp4", "boat until ferris wheel"]
+                ],
+                inputs=[video, query],
+                cache_examples=False
+            )
+
+        with gr.Column(elem_id="right"):
+            cropped_video = gr.Video(label="Cropped Video (Frames of Interest Only)")
+            prop_ranges_out = gr.Textbox(label="Propositions by Frames", lines=6, interactive=False)
+            timeline_plot_output = gr.Plot(label="Propositions Timeline")
+            tl_out = gr.Textbox(label="TL (Propositions & Specification)", lines=8, interactive=False)
+
+    # ONLY the Run button triggers processing/preprocessing
+    run_btn.click(
+        fn=run_pipeline,
+        inputs=[video, mode, detector, query, propositions, specification],
+        outputs=[cropped_video, prop_ranges_out, tl_out, timeline_plot_output]
+    )
+
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)
+
execute_with_mp4.py CHANGED
@@ -1,4 +1,4 @@
-from tqdm import tqdm
 import itertools
 import operator
 import json
@@ -6,24 +6,30 @@ import time
 import os
 
 from ns_vfs.nsvs import run_nsvs
 from ns_vfs.video.read_mp4 import Mp4Reader
 
 
 VIDEOS = [
     {
-        "path": "demo_videos/blue_shirt.mp4",
-        "query": "a woman is jumping and clapping until a candle is blown"
     }
 ]
 DEVICE = 7  # GPU device index
 OPENAI_SAVE_PATH = ""
 OUTPUT_DIR = "output"
 
 def fill_in_frame_count(arr, entry):
     scale = (entry["video_info"].fps) / (entry["metadata"]["sampling_rate_fps"])
 
     runs = []
-    for _, grp in itertools.groupby(sorted(arr), key=lambda x, c=[0]: (x - (c.__setitem__(0, c[0]+1) or c[0]))):
         g = list(grp)
         runs.append((g[0], g[-1]))
@@ -36,30 +42,106 @@ def fill_in_frame_count(arr, entry):
         real.extend(range(a, b + 1))
     return real
 
-def process_entry(entry):
-    foi, object_frame_dict, px = run_nsvs(
-        frames=entry['images'],
-        proposition=entry['tl']['propositions'],
-        specification=entry['tl']['specification'],
-        model_name="InternVL2-8B",
-        device=DEVICE
-    )
 
-    foi = fill_in_frame_count([i for sub in foi for i in sub], entry)
-    object_frame_dict = {key: fill_in_frame_count(value, entry) for key, value in object_frame_dict.items()}
-    px = {key: fill_in_frame_count(value, entry) for key, value in px.items()}
-    return foi, object_frame_dict, px
 
 def main():
     reader = Mp4Reader(VIDEOS, OPENAI_SAVE_PATH, sampling_rate_fps=1)
     data = reader.read_video()
     if not data:
         return
 
-    with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
         for i, entry in pbar:
             start_time = time.time()
-            foi = process_entry(entry)
             end_time = time.time()
             processing_time = round(end_time - start_time, 3)
+import tqdm
 import itertools
 import operator
 import json
 import os
 
 from ns_vfs.nsvs import run_nsvs
+from ns_vfs.nsvs_yolo import *
 from ns_vfs.video.read_mp4 import Mp4Reader
 
 
 VIDEOS = [
     {
+        "path": "demo_videos/car.mp4",
+        "query": "car until truck"
     }
 ]
 DEVICE = 7  # GPU device index
 OPENAI_SAVE_PATH = ""
 OUTPUT_DIR = "output"
 
+import itertools
+
 def fill_in_frame_count(arr, entry):
     scale = (entry["video_info"].fps) / (entry["metadata"]["sampling_rate_fps"])
 
     runs = []
+    for _, grp in itertools.groupby(
+        sorted(arr),
+        key=lambda x, c=[0]: (x - (c.__setitem__(0, c[0] + 1) or c[0]))
+    ):
         g = list(grp)
         runs.append((g[0], g[-1]))
 
         real.extend(range(a, b + 1))
     return real
 
+def _fill_in_frame_count_pairs(pairs, entry):
+    if not pairs:
+        return []
+    scale = (entry["video_info"].fps) / (entry["metadata"]["sampling_rate_fps"])
+
+    pairs = sorted(pairs, key=lambda t: int(t[0]))
+    sampled_indices = [int(i) for i, _ in pairs]
+
+    runs = []
+    for _, grp in itertools.groupby(
+        sampled_indices,
+        key=lambda x, c=[0]: (x - (c.__setitem__(0, c[0] + 1) or c[0]))
+    ):
+        g = list(grp)
+        runs.append((g[0], g[-1]))
+
+    idx2bbox = {}
+    for i, bbox in pairs:
+        i = int(i)
+        if i not in idx2bbox:
+            idx2bbox[i] = bbox
+
+    expanded: list[tuple[int, tuple[float, float, float, float]]] = []
+    last_real = -1
+
+    for start_i, end_i in runs:
+        rep_bbox = idx2bbox.get(start_i)
+        if rep_bbox is None:
+            for k in range(start_i, end_i + 1):
+                if k in idx2bbox:
+                    rep_bbox = idx2bbox[k]
+                    break
+        if rep_bbox is None:
+            continue
+
+        a = int(round(start_i * scale))
+        b = int(round(end_i * scale))
+        if expanded and a <= last_real:
+            a = last_real + 1
+        for real_i in range(a, b + 1):
+            expanded.append((real_i, rep_bbox))
+        last_real = b
+
+    return expanded
+
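# Worked example of the expansion above: with fps = 30 and
# sampling_rate_fps = 15, scale = 2.0, so sampled detections
# [(3, bbox), (4, bbox)] form one run (3, 4) and expand to real frames
# round(3*2)=6 .. round(4*2)=8, each paired with the run's representative
# bbox: [(6, bbox), (7, bbox), (8, bbox)].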
+def process_entry(entry, run_with_yolo=False, cache_path=""):
+    """
+    VLM path (run_with_yolo=False):
+      - Returns (foi, object_frame_dict_expanded)
+        where object_frame_dict_expanded: Dict[str, List[int]] (real frame indices)
+
+    YOLO path (run_with_yolo=True):
+      - Expects run_nsvs_yolo to return (foi, object_frame_bounding_boxes)
+        where object_frame_bounding_boxes: Dict[str, List[(sample_idx, bbox)]]
+      - Returns (foi, object_frame_bounding_boxes_expanded)
+        where each bbox is duplicated across the scaled span to real frames:
+        Dict[str, List[(real_idx, bbox)]]
+    """
+    if run_with_yolo:
+        foi, object_frame_bounding_boxes = run_nsvs_yolo(
+            frames=entry["images"],
+            proposition=entry['tl']['propositions'],
+            specification=entry['tl']['specification'],
+            yolo_cache_path=cache_path,
+            vlm_detection_threshold=0.35,
+        )
+        foi = fill_in_frame_count([i for sub in foi for i in sub], entry)
+
+        expanded_boxes = {}
+        for key, pairs in (object_frame_bounding_boxes or {}).items():
+            expanded_boxes[key] = _fill_in_frame_count_pairs(pairs, entry)
+        return foi, expanded_boxes
+
+    else:
+        foi, object_frame_dict = run_nsvs(
+            frames=entry['images'],
+            proposition=entry['tl']['propositions'],
+            specification=entry['tl']['specification'],
+            model_name="InternVL2-8B",
+            device=DEVICE
+        )
+        foi = fill_in_frame_count([i for sub in foi for i in sub], entry)
+        object_frame_dict = {key: fill_in_frame_count(value, entry) for key, value in (object_frame_dict or {}).items()}
+        return foi, object_frame_dict
 
 def main():
     reader = Mp4Reader(VIDEOS, OPENAI_SAVE_PATH, sampling_rate_fps=1)
     data = reader.read_video()
     if not data:
         return
+
+    # cache_path = preprocess_yolo(entry["images"], model_weights="yolov8n.pt",
+    #                              device="cuda:0", out_path="yolo_cache.npz")
 
+    with tqdm.tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
         for i, entry in pbar:
             start_time = time.time()
+            # cache_path matches the out_path of the (commented) preprocess_yolo call above;
+            # the default "" would raise FileNotFoundError inside run_nsvs_yolo
+            foi = process_entry(entry, run_with_yolo=True, cache_path="yolo_cache.npz")
             end_time = time.time()
             processing_time = round(end_time - start_time, 3)

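# Return shapes of process_entry, side by side (illustrative values only):
#   VLM path:  {"car": [0, 1, 2, 30, 31]}                        # real frame indices
#   YOLO path: {"car": [(0, (12.0, 40.0, 180.0, 220.0)), ...]}   # (real_idx, bbox) pairs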
launch_space.sh CHANGED
@@ -2,6 +2,7 @@
 
 apt update
 apt install -y ffmpeg
+pip install ultralytics
 
 # Start vLLM server in background
 ./vllm_serve.sh &
@@ -19,4 +20,4 @@ echo "
 "
 
 # Start Gradio app
-python3 execute_demo_v2.py
+python3 execute_demo_v3.py
ns_vfs/nsvs.py CHANGED
@@ -24,15 +24,12 @@ def run_nsvs(
     tl_satisfaction_threshold: float = 0.6,
     detection_threshold: float = 0.5,
     vlm_detection_threshold: float = 0.35,
-    image_output_dir: str = "output"
 ):
     """Find relevant frames from a video that satisfy a specification"""
 
     object_frame_dict = {}
-    object_frame_dict_prob = {}
-    vlm = VLLMClient()
-    # vlm = InternVL(model_name=model_name, device=device)
 
+    vlm = VLLMClient()
     automaton = VideoAutomaton(include_initial_state=True)
     automaton.set_up(proposition_set=proposition)
 
@@ -62,12 +59,6 @@ def run_nsvs(
             object_of_interest[prop] = detected_object
             if detected_object.is_detected:
                 multi_frame_arr = [frame_count * num_of_frame_in_sequence + j for j in range(num_of_frame_in_sequence)]
-                p2 = f"{prop}: {detected_object.probability}"
-                if p2 in object_frame_dict_prob:
-                    object_frame_dict_prob[p2].extend(multi_frame_arr)
-                else:
-                    object_frame_dict_prob[p2] = multi_frame_arr
-
                 if prop in object_frame_dict:
                     object_frame_dict[prop].extend(multi_frame_arr)
                 else:
@@ -93,9 +84,6 @@ def run_nsvs(
         print("\n" + "*"*50 + f" {i}/{len(frame_windows)-1} " + "*"*50)
         print("Detections:")
         frame = process_frame(sequence_of_frames, i)
-        if PRINT_ALL:
-            os.makedirs(image_output_dir, exist_ok=True)
-            frame.save_frame_img(save_path=os.path.join(image_output_dir, f"{i}"))
 
         if checker.validate_frame(frame_of_interest=frame):
             automaton.add_frame(frame=frame)
@@ -112,5 +100,5 @@ def run_nsvs(
     print("Detected frames of interest:")
     print(foi)
 
-    return foi, object_frame_dict, object_frame_dict_prob
+    return foi, object_frame_dict
 
ns_vfs/nsvs_yolo.py ADDED
@@ -0,0 +1,215 @@
+# -------------------------------
+# Preprocess: per-frame dicts {class: List[(conf, (x1,y1,x2,y2))]}
+# -------------------------------
+from ultralytics import YOLO
+import numpy as np
+import warnings
+import tqdm
+import os
+import pickle
+import re
+from typing import Dict, List, Literal, Tuple
+
+from ns_vfs.model_checker.property_checker import PropertyChecker
+from ns_vfs.model_checker.video_automaton import VideoAutomaton
+from ns_vfs.vlm.obj import DetectedObject
+from ns_vfs.vlm.vllm_client import VLLMClient
+from ns_vfs.video.frame import FramesofInterest, VideoFrame
+
+PRINT_ALL = True
+warnings.filterwarnings("ignore")
+
+
+def preprocess_yolo(
+    frames: List[np.ndarray],
+    model_weights: str = "yolov8n.pt",
+    device: str | int = "cuda:0",
+    batch_size: int = 16,
+    out_path: str = "yolo_det_cache.pkl",
+    conf_threshold: float = 0.001,
+    iou: float = 0.7,
+) -> str:
+    """
+    Run YOLOv8 detection on every frame and save a list of dicts.
+    Cache format:
+        yolo_dets: List[Dict[str, List[Tuple[float, Tuple[float, float, float, float]]]]]
+        # one item per frame
+        # each frame dict maps: class_name (lowercase, spaces) ->
+        #     list of (confidence, (x1, y1, x2, y2)) in pixel coordinates
+    """
+    model = YOLO(model_weights)
+    id_to_name: Dict[int, str] = {int(k): str(v).lower() for k, v in model.names.items()}
+
+    yolo_dets: List[Dict[str, List[Tuple[float, Tuple[float, float, float, float]]]]] = []
+
+    for start in range(0, len(frames), batch_size):
+        batch = frames[start:start + batch_size]
+        results = model.predict(
+            batch,
+            device=device,
+            conf=conf_threshold,
+            iou=iou,
+            verbose=False,
+        )
+
+        for r in results:
+            frame_dict: Dict[str, List[Tuple[float, Tuple[float, float, float, float]]]] = {}
+            if r.boxes is not None and len(r.boxes) > 0:
+                # xyxy in pixels, conf, and class ids
+                xyxy = r.boxes.xyxy.detach().cpu().numpy().astype(float)
+                confs = r.boxes.conf.detach().cpu().numpy().astype(float)
+                cls_ids = r.boxes.cls.detach().cpu().numpy().astype(int)
+
+                for (x1, y1, x2, y2), conf, cid in zip(xyxy, confs, cls_ids):
+                    name = id_to_name.get(int(cid), str(cid))  # e.g., "traffic light"
+                    frame_dict.setdefault(name, []).append(
+                        (float(conf), (float(x1), float(y1), float(x2), float(y2)))
+                    )
+
+            yolo_dets.append(frame_dict)
+
+    assert len(yolo_dets) == len(frames), f"expected {len(frames)} dicts, got {len(yolo_dets)}"
+
+    with open(out_path, "wb") as f:
+        pickle.dump(yolo_dets, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    return out_path
+
+
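# Reading the cache back is a plain pickle load (sketch; the path is whatever
# preprocess_yolo was given as out_path):
with open("yolo_det_cache.pkl", "rb") as f:
    dets = pickle.load(f)
# e.g. 120 frames; first "car" hit might look like (0.87, (34.0, 50.0, 310.0, 240.0))
print(len(dets), dets[0].get("car", [])[:1])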
+# -------------------------------
+# NSVS using cached YOLO dicts; 1 frame per step
+# -------------------------------
+
+# normalize props to YOLO label style (spaces, lowercase, collapsed whitespace)
+_WS = re.compile(r"\s+")
+def normalize_label_for_yolo(s: str) -> str:
+    s = (s or "").strip().lower()
+    s = s.replace("_", " ")
+    s = s.replace("-", " ").replace("–", " ").replace("—", " ")
+    s = _WS.sub(" ", s)
+    return s
+
+
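# e.g. normalize_label_for_yolo("Traffic_Light")  -> "traffic light"
#      normalize_label_for_yolo("fire--hydrant")  -> "fire hydrant"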
+def run_nsvs_yolo(
+    frames: List[np.ndarray],
+    proposition: List[str],
+    specification: str,
+    *,
+    yolo_cache_path: str = "yolo_det_cache.pkl",
+    model_type: str = "dtmc",
+    tl_satisfaction_threshold: float = 0.6,
+    detection_threshold: float = 0.5,
+    vlm_detection_threshold: float = 0.35,  # used as 'false_threshold' in calibrate()
+    image_output_dir: str = "output",
+) -> Tuple[List[VideoFrame], Dict[str, List[Tuple[int, Tuple[float, float, float, float]]]]]:
+    """
+    Replaces vlm.detect with cached YOLO per frame (1-frame sequences).
+    Returns:
+        foi: List[VideoFrame]
+        object_frame_bounding_boxes:
+            Dict[str, List[(frame_index, (x1, y1, x2, y2))]]
+            # one bbox per frame (the highest-confidence bbox for that class in that frame)
+    """
+    if not os.path.exists(yolo_cache_path):
+        raise FileNotFoundError(
+            f"YOLO cache not found at '{yolo_cache_path}'. "
+            f"Call preprocess_yolo(frames, out_path='yolo_det_cache.pkl') first."
+        )
+
+    with open(yolo_cache_path, "rb") as f:
+        # List[Dict[str, List[(conf, (x1,y1,x2,y2))]]]
+        yolo_dets: List[Dict[str, List[Tuple[float, Tuple[float, float, float, float]]]]] = pickle.load(f)
+
+    if len(yolo_dets) != len(frames):
+        raise ValueError(f"cache length {len(yolo_dets)} != frames length {len(frames)}")
+
+    # Build normalized lookup (e.g., "traffic_light" -> "traffic light")
+    prop_lookup: Dict[str, str] = {prop_raw: normalize_label_for_yolo(prop_raw) for prop_raw in proposition}
+
+    automaton = VideoAutomaton(include_initial_state=True)
+    automaton.set_up(proposition_set=proposition)  # original props for TL
+
+    checker = PropertyChecker(
+        proposition=proposition,
+        specification=specification,
+        model_type=model_type,
+        tl_satisfaction_threshold=tl_satisfaction_threshold,
+        detection_threshold=detection_threshold,
+    )
+
+    frame_of_interest = FramesofInterest(1)  # 1-frame sequences
+    object_frame_bounding_boxes: Dict[str, List[Tuple[int, Tuple[float, float, float, float]]]] = {}
+
+    calibrator = VLLMClient()
+
+    def _mk_detected_object(name: str, confidence: float) -> DetectedObject:
+        probability = calibrator.calibrate(confidence=confidence, false_threshold=vlm_detection_threshold)
+        return DetectedObject(
+            name=name,
+            is_detected=confidence >= vlm_detection_threshold,
+            confidence=confidence,
+            probability=probability,
+        )
+
+    looper = range(len(frames)) if PRINT_ALL else tqdm.tqdm(range(len(frames)))
+    for i in looper:
+        if PRINT_ALL:
+            print("\n" + "*" * 50 + f" {i}/{len(frames) - 1} " + "*" * 50)
+            print("Detections:")
+
+        # Per-frame dict: class -> List[(conf, (x1,y1,x2,y2))]
+        det_dict = yolo_dets[i]
+        object_of_interest = {}
+
+        for prop_raw in proposition:
+            yolo_label = prop_lookup[prop_raw]
+            dets_for_class = det_dict.get(yolo_label, [])
+
+            # confidence for the decision = max conf for that class in this frame (0 if none)
+            if dets_for_class:
+                confs = [c for c, _ in dets_for_class]
+                max_idx = int(np.argmax(confs))
+                best_conf, best_bbox = dets_for_class[max_idx]
+            else:
+                best_conf, best_bbox = 0.0, None
+
+            det = _mk_detected_object(prop_raw, float(best_conf))
+            object_of_interest[prop_raw] = det
+
+            if det.is_detected and best_bbox is not None:
+                # one bbox per frame (the highest-confidence one)
+                object_frame_bounding_boxes.setdefault(prop_raw, []).append((i, best_bbox))
+
+            if PRINT_ALL:
+                if best_bbox is not None:
+                    x1, y1, x2, y2 = best_bbox
+                    print(f"\t{prop_raw} (yolo='{yolo_label}'): conf={det.confidence:.3f} "
+                          f"-> prob={det.probability:.3f} bbox=({x1:.1f},{y1:.1f},{x2:.1f},{y2:.1f})"
+                          + (" [DETECTED]" if det.is_detected else ""))
+                else:
+                    print(f"\t{prop_raw} (yolo='{yolo_label}'): conf=0.000 -> prob={det.probability:.3f}")
+
+        frame = VideoFrame(
+            frame_idx=i,
+            frame_images=[frames[i]],  # single frame
+            object_of_interest=object_of_interest,
+        )
+
+        if checker.validate_frame(frame_of_interest=frame):
+            automaton.add_frame(frame=frame)
+            frame_of_interest.frame_buffer.append(frame)
+            model_check = checker.check_automaton(automaton=automaton)
+            if model_check:
+                automaton.reset()
+                frame_of_interest.flush_frame_buffer()
+
+    foi = frame_of_interest.foi_list
+
+    if PRINT_ALL:
+        print("\n" + "-" * 107)
+        print("Detected frames of interest:")
+        print(foi)
+
+    # NOTE: replaced the old object_frame_dict return
+    return foi, object_frame_bounding_boxes
+
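# End-to-end sketch of the YOLO path (illustrative, not part of the module;
# assumes `frames` is the sampled List[np.ndarray] from Mp4Reader, a GPU for
# YOLO, and a reachable vLLM server behind VLLMClient.calibrate; the
# proposition names and TL specification below are placeholders):
cache = preprocess_yolo(frames, model_weights="yolov8n.pt", device="cuda:0")
foi, boxes = run_nsvs_yolo(
    frames=frames,
    proposition=["car", "truck"],
    specification='"car" U "truck"',
    yolo_cache_path=cache,
)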
pyproject.toml CHANGED
@@ -18,5 +18,6 @@ dependencies = [
     "timm>=1.0.19",
     "tqdm>=4.67.1",
     "transformers>=4.41,<4.47",
+    "ultralytics>=8.3.201",
 ]
 