# Hugging Face Space: Syzygianinfern0/NSVS
# Space status: Runtime error
# File size: 5,158 bytes

import tqdm
import itertools
import operator
import json
import time
import os

from ns_vfs.nsvs import run_nsvs
from ns_vfs.nsvs_yolo import *
from ns_vfs.video.read_mp4 import Mp4Reader


# Videos handed to Mp4Reader in main(); each item pairs a video file path
# with the query to evaluate against it.
VIDEOS = [
    {"path": "demo_videos/car.mp4", "query": "car until truck"},
]

# Passed as `device=` to run_nsvs on the VLM path of process_entry().
DEVICE = 7  # GPU device index

# Passed to Mp4Reader as its second positional argument in main().
OPENAI_SAVE_PATH = ""

# Directory that receives the per-entry ``output_<i>.json`` files.
OUTPUT_DIR = "output"

import itertools

def fill_in_frame_count(arr, entry):
    """Expand sampled-frame indices into real video frame indices.

    The sampled indices in ``arr`` are sorted and split into runs of
    consecutive values (a repeated value starts a new run).  Each run
    ``[first, last]`` is scaled by ``video fps / sampling fps`` and every
    integer frame between the two scaled endpoints (inclusive) is emitted.

    Args:
        arr: Iterable of sampled-frame indices, in any order.
        entry: Mapping providing ``entry["video_info"].fps`` and
            ``entry["metadata"]["sampling_rate_fps"]``.

    Returns:
        A strictly increasing list of real frame indices.
    """
    ratio = entry["video_info"].fps / entry["metadata"]["sampling_rate_fps"]

    # Collect [first, last] spans of consecutive sampled indices.
    spans = []
    for idx in sorted(arr):
        if spans and idx == spans[-1][1] + 1:
            spans[-1][1] = idx
        else:
            spans.append([idx, idx])

    frames = []
    for first, last in spans:
        lo = int(round(first * ratio))
        hi = int(round(last * ratio))
        if frames:
            # Rounding can make neighbouring spans touch or overlap; never
            # emit a frame at or before the last one already produced.
            lo = max(lo, frames[-1] + 1)
        frames.extend(range(lo, hi + 1))
    return frames


def _fill_in_frame_count_pairs(pairs, entry):
    if not pairs:
        return []
    scale = (entry["video_info"].fps) / (entry["metadata"]["sampling_rate_fps"])

    pairs = sorted(pairs, key=lambda t: int(t[0]))
    sampled_indices = [int(i) for i, _ in pairs]

    runs = []
    for _, grp in itertools.groupby(
        sampled_indices,
        key=lambda x, c=[0]: (x - (c.__setitem__(0, c[0] + 1) or c[0]))
    ):
        g = list(grp)
        runs.append((g[0], g[-1]))

    idx2bbox = {}
    for i, bbox in pairs:
        i = int(i)
        if i not in idx2bbox:
            idx2bbox[i] = bbox

    expanded: list[tuple[int, tuple[float, float, float, float]]] = []
    last_real = -1

    for start_i, end_i in runs:
        rep_bbox = idx2bbox.get(start_i)
        if rep_bbox is None:
            for k in range(start_i, end_i + 1):
                if k in idx2bbox:
                    rep_bbox = idx2bbox[k]
                    break
        if rep_bbox is None:
            continue

        a = int(round(start_i * scale))
        b = int(round(end_i * scale))
        if expanded and a <= last_real:
            a = last_real + 1
        for real_i in range(a, b + 1):
            expanded.append((real_i, rep_bbox))
        last_real = b

    return expanded


def process_entry(entry, run_with_yolo=False, cache_path=""):
    """Run NSVS on one entry and map the results onto real frame indices.

    VLM path (``run_with_yolo=False``):
        Calls ``run_nsvs`` and returns ``(foi, object_frames)`` where
        ``object_frames`` is ``Dict[str, List[int]]`` of real frame indices.

    YOLO path (``run_with_yolo=True``):
        Calls ``run_nsvs_yolo``, which is expected to return
        ``(foi, object_frame_bounding_boxes)`` with
        ``object_frame_bounding_boxes: Dict[str, List[(sample_idx, bbox)]]``.
        Returns ``(foi, boxes)`` where ``boxes`` is
        ``Dict[str, List[(real_idx, bbox)]]``: each bbox is duplicated across
        the scaled span of real frames.

    In both paths ``foi`` is the flattened list of frames of interest,
    expanded from sampled indices to real frame indices.

    Args:
        entry: Mapping with ``"images"``, ``"tl"`` (holding
            ``"propositions"`` and ``"specification"``), ``"video_info"``
            and ``"metadata"``.
        run_with_yolo: Selects the YOLO path when true, the VLM path otherwise.
        cache_path: Forwarded to ``run_nsvs_yolo`` as ``yolo_cache_path``;
            unused on the VLM path.
    """
    frames = entry["images"]
    propositions = entry["tl"]["propositions"]
    specification = entry["tl"]["specification"]

    if not run_with_yolo:
        sampled_foi, frames_by_object = run_nsvs(
            frames=frames,
            proposition=propositions,
            specification=specification,
            model_name="InternVL2-8B",
            device=DEVICE,
        )
        foi = fill_in_frame_count(
            list(itertools.chain.from_iterable(sampled_foi)), entry
        )
        expanded_frames = {}
        for name, sampled in (frames_by_object or {}).items():
            expanded_frames[name] = fill_in_frame_count(sampled, entry)
        return foi, expanded_frames

    sampled_foi, boxes_by_object = run_nsvs_yolo(
        frames=frames,
        proposition=propositions,
        specification=specification,
        yolo_cache_path=cache_path,
        vlm_detection_threshold=0.35,
    )
    foi = fill_in_frame_count(
        list(itertools.chain.from_iterable(sampled_foi)), entry
    )
    return foi, {
        name: _fill_in_frame_count_pairs(pairs, entry)
        for name, pairs in (boxes_by_object or {}).items()
    }

def main():
    """Process every entry read from VIDEOS and write one JSON file per hit.

    Reads the configured videos through ``Mp4Reader``, runs
    ``process_entry`` (YOLO path) on each entry, and for every entry that
    yields frames of interest writes ``OUTPUT_DIR/output_<i>.json`` holding
    the entry's ``tl``, ``metadata``, ``video_info``, the frames of interest
    and the processing time in seconds.  Returns early when the reader
    produces no data.
    """
    reader = Mp4Reader(VIDEOS, OPENAI_SAVE_PATH, sampling_rate_fps=1)
    data = reader.read_video()
    if not data:
        return

    # cache_path = preprocess_yolo(entry["images"], model_weights="yolov8n.pt",
    #                              device="cuda:0", out_path="yolo_cache.npz")

    with tqdm.tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
        for i, entry in pbar:
            # perf_counter is monotonic, so the interval cannot be skewed by
            # wall-clock adjustments.
            start_time = time.perf_counter()
            # process_entry returns (foi, per-object frames).  Binding the
            # whole tuple to ``foi`` made the emptiness check below always
            # true and wrote the tuple itself as "frames_of_interest".
            foi, _object_frames = process_entry(entry, run_with_yolo=True)
            processing_time = round(time.perf_counter() - start_time, 3)

            if not foi:
                continue

            output = {
                "tl": entry["tl"],
                "metadata": entry["metadata"],
                "video_info": entry["video_info"].to_dict(),
                "frames_of_interest": foi,
                # Key spelling (sic) kept as-is so existing consumers of the
                # output files keep working.
                "processting_time_seconds": processing_time,
            }

            os.makedirs(OUTPUT_DIR, exist_ok=True)
            out_path = os.path.join(OUTPUT_DIR, f"output_{i}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(output, f, indent=4)


if __name__ == "__main__":
    main()