# NSVS / scripts / no_nsvs.py
# Syzygianinfern0's picture
# Initial clean commit for HF Spaces deployment with LFS
# 47875a1
from enum import Enum
from tqdm import tqdm
import json
import time
import os
from ns_vfs.vlm.internvl import InternVL
from ns_vfs.video.read_tlv import TLVReader
class RunConfig(Enum):
    """Frame-selection strategies this script can run.

    The member values are reused as the suffix of the output folder name.
    """

    # Score overlapping windows of frames and keep the best one.
    SLIDING_WINDOW = "sliding_window"
    # Score every frame on its own.
    FRAME_WISE = "frame_wise"
# Strategy executed by main().
CURRENT_CONFIG = RunConfig.SLIDING_WINDOW

# Dataset location and model selection.
TLV_PATH = "/nas/dataset/tlv-dataset-v1"
MODEL_NAME = "InternVL2-8B"
# GPU device index
DEVICE = 7

# Calibration threshold handed to every model.detect() call.
CALIBRATION_THRESHOLD = 0.349
# Frame-wise mode: a frame is kept only if its probability exceeds this.
THRESHOLD = 0.5

# Sliding-window mode: distance between consecutive window starts.
STRIDE = 10
# Sliding-window mode: maximum number of frames per window.
WINDOW = 20
def sliding_window(entry):  # answers "which sequence of `WINDOW` frames can best answer the query"
    """Return the frame indices of the best detected window for one entry.

    Windows start every ``STRIDE`` frames and span at most ``WINDOW`` frames
    (trailing windows are truncated at the end of the video). Each window is
    scored with ``model.detect``; the window with the highest probability
    among those flagged as detected wins.

    Args:
        entry: Mapping providing ``entry["tl"]["query"]`` (the query passed
            to the model) and ``entry["images"]`` (a sliceable sequence of
            frames).

    Returns:
        A list of 0-based frame indices covering the winning window, or an
        empty list when no window was detected.
    """
    query = entry["tl"]["query"]
    frames = entry["images"]
    model = InternVL(model_name=MODEL_NAME, device=DEVICE)

    best_prob = -1.0
    best_span = None  # (start, end) of the winning window, end exclusive

    starts = range(0, len(frames), STRIDE)
    with tqdm(starts, desc=f"Sliding window (stride={STRIDE}, window={WINDOW})") as pbar:
        for start in pbar:
            end = min(start + WINDOW, len(frames))
            detect = model.detect(frames[start:end], query, CALIBRATION_THRESHOLD)
            prob = detect.probability
            is_detected = detect.is_detected

            # The postfix shows the best probability seen *before* this window.
            pbar.set_postfix(
                {"best_prob": f"{best_prob:.3f}", "current_prob": f"{prob:.3f}", "detected": is_detected}
            )

            if is_detected and prob > best_prob:
                best_prob = prob
                best_span = (start, end)

    if best_span is None:
        return []

    # `end` is exclusive, exactly like the slice that was scored; extending the
    # range by one would report a frame the model never saw and, for the last
    # window, an index equal to len(frames).
    return list(range(*best_span))
def frame_wise(entry):
    """Return the indices of the frames that individually match the query.

    Every frame is passed to ``model.detect`` on its own. A frame is kept
    when the model flags it as detected and its probability is strictly
    greater than ``THRESHOLD``.

    Args:
        entry: Mapping providing ``entry["tl"]["query"]`` (the query passed
            to the model) and ``entry["images"]`` (a sequence of frames).

    Returns:
        A list of 0-based indices of the selected frames, in ascending order.
    """
    query = entry["tl"]["query"]
    frames = entry["images"]
    # Use the shared MODEL_NAME so the loaded model always matches the name
    # used for the output folder.
    model = InternVL(model_name=MODEL_NAME, device=DEVICE)

    foi = []
    with tqdm(enumerate(frames), total=len(frames), desc=f"Framewise (threshold={THRESHOLD})") as pbar:
        for index, frame in pbar:
            detect = model.detect([frame], query, CALIBRATION_THRESHOLD)
            prob = detect.probability
            is_detected = detect.is_detected

            pbar.set_postfix({"current_prob": f"{prob:.3f}", "detected": is_detected})

            if is_detected and prob > THRESHOLD:
                foi.append(index)
    return foi
def main():
    """Run the configured strategy on every dataset entry and save the results.

    Entries are read through ``TLVReader(TLV_PATH).read_video()``. For each
    entry the strategy selected by ``CURRENT_CONFIG`` is executed and timed,
    and one ``output_<i>.json`` file is written into a folder named
    ``<MODEL_NAME>_<config value>`` under the experiment results directory.
    Returns without doing anything when the reader yields no data.
    """
    reader = TLVReader(TLV_PATH)
    data = reader.read_video()
    if not data:
        return

    output_dir = os.path.join(
        "/nas/mars/experiment_result/nsvs/nsvs2-prelims",
        f"{MODEL_NAME}_{CURRENT_CONFIG.value}",
    )
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    if CURRENT_CONFIG == RunConfig.SLIDING_WINDOW:
        select_frames = sliding_window
    else:
        select_frames = frame_wise

    with tqdm(enumerate(data), total=len(data), desc="Processing entries") as pbar:
        for i, entry in pbar:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            foi = select_frames(entry)
            elapsed = time.perf_counter() - start_time

            output = {
                "propositions": entry["tl"]["propositions"],
                "specification": entry["tl"]["specification"],
                "ground_truth": entry["metadata"]["ground_truth"],
                "frames_of_interest": foi,
                "type": entry["metadata"]["type"],
                "number_of_frames": entry["video_info"].frame_count,
                # NOTE: key spelling kept as-is; existing result readers may
                # depend on it.
                "processting_time_seconds": round(elapsed, 3),
            }

            output_path = os.path.join(output_dir, f"output_{i}.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(output, f, indent=4)
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()