File size: 5,254 Bytes

b0c0df0

import argparse
import json
import os
import pdb
import re
from copy import deepcopy
from pathlib import Path

import numpy as np


# read json files
def read_json(path):
    """Load and return the JSON document stored at *path*.

    The file is decoded explicitly as UTF-8 (the standard JSON encoding)
    instead of the platform's locale encoding, so non-ASCII text loads the
    same way on every platform.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).
    """
    with open(path, "r", encoding="utf-8") as fin:
        return json.load(fin)


def write_json(path, data):
    """Dump *data* as JSON into the file at *path* and print a confirmation.

    Args:
        path: Destination file; it is opened in write mode, so existing
            content is replaced.
        data: Any value ``json.dump`` can serialize.

    Returns:
        None.
    """
    with open(path, "w") as handle:
        json.dump(data, handle)
    message = "The format file has been saved at:{}".format(path)
    print(message)


# "<a> - <b>" range; a hyphen or an en dash may separate the two numbers.
_RANGE_PATTERN = re.compile(r"(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)")
# Same range, but tolerating an "s" unit suffix on either number ("20.8s - 30.0s").
_SUFFIXED_RANGE_PATTERN = re.compile(r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?")
# One time token: an HH:MM:SS clock value (group 1) or a plain number (group 2).
# The clock alternative is tried first so "00:00:18" is not read as the number "00".
_TIME_TOKEN_PATTERN = re.compile(r"\b(?:(\d{1,2}:\d{2}:\d{2})|(\d+(?:\.\d+)?))\b")
# Substrings that mark a sentence as likely to carry time information.
_TIME_KEYWORDS = ("starts", "ends", "happens in", "start time", "end time", "start", "end", "happen")


def _token_seconds(match):
    """Convert a ``_TIME_TOKEN_PATTERN`` match to a number of seconds (float)."""
    clock, number = match.group(1), match.group(2)
    if clock is not None:
        hours, minutes, seconds = (int(part) for part in clock.split(":"))
        return float(hours * 3600 + minutes * 60 + seconds)
    return float(number)


def _pair_up(times):
    """Group consecutive values into (start, end) tuples, dropping an odd trailing value."""
    return list(zip(times[0::2], times[1::2]))


def extract_time(paragraph):
    """Extract a single ``[start, end]`` time span (in seconds) from free text.

    Only sentences (split on ``!``, ``?`` and newlines) that contain one of the
    time keywords are inspected. The following strategies are tried in order
    and the first one that yields a span wins:

    1. An explicit range ``m - n`` (hyphen or en dash; the word "to" is
       rewritten to "-" beforehand), e.g. "happens in 20.8 - 30.0 seconds".
    2. The first time token of each keyword sentence, paired up in order, e.g.
       "Starting time: 0.8 seconds" / "Ending time: 1.1 seconds" on two lines.
       A token is either a plain number or an HH:MM:SS clock value.
    3. A range whose numbers carry an "s" suffix, e.g. "20.8s - 30.0s".
    4. All time tokens of one keyword sentence, paired up in order, e.g.
       "The start time is 0 seconds, and the end time is 12 seconds." or
       "starts at 00:00:18 and ends at 00:00:23."

    Args:
        paragraph: Text to search (case-insensitive).

    Returns:
        ``[]`` when no span is found, otherwise a one-element list
        ``[[start, end]]`` with ``start <= end``.
    """
    prompt = "A specific example is : 20.8 - 30.0 seconds".lower()
    # Drop the echoed prompt example so it is not mistaken for an answer.
    # NOTE: "to" is replaced everywhere, including inside words ("stop" -> "s-p");
    # this is kept as-is so that existing parses do not change.
    text = paragraph.lower().replace(prompt, "").replace("to", "-")

    # Periods are deliberately not sentence delimiters: they occur in decimals.
    sentences = re.split(r"[!?\n]", text)
    candidates = [
        sentence
        for sentence in sentences
        if any(keyword in sentence for keyword in _TIME_KEYWORDS)
    ]

    timestamps = []

    # 1. Explicit "m - n" range in the first candidate sentence that has one.
    for sentence in candidates:
        matches = _RANGE_PATTERN.findall(sentence)
        if matches:
            timestamps = [[float(start), float(end)] for start, end in matches]
            break

    # 2. First time token per candidate sentence (start on one line, end on the next).
    if not timestamps:
        first_tokens = []
        for sentence in candidates:
            match = _TIME_TOKEN_PATTERN.search(sentence)
            if match:
                first_tokens.append(_token_seconds(match))
        timestamps = _pair_up(first_tokens)

    # 3. Range with unit suffixes such as "x.xx s - x.xx s" or "x.xxs - x.xxs".
    if not timestamps:
        for sentence in candidates:
            matches = _SUFFIXED_RANGE_PATTERN.findall(sentence)
            if matches:
                timestamps = [[float(start), float(end)] for start, end in matches]
                break

    # 4. Last resort: start and end both live in the same sentence without a
    #    dash between them, so read every time token of that sentence.
    if not timestamps:
        for sentence in candidates:
            tokens = [_token_seconds(m) for m in _TIME_TOKEN_PATTERN.finditer(sentence)]
            if len(tokens) >= 2:
                timestamps = _pair_up(tokens)
                break

    # Normalize each pair so that start <= end, then keep only the first span.
    results = []
    for start, end in timestamps:
        if end > start:
            results.append([start, end])
        else:
            results.append([end, start])
    return results[:1]


def iou(A, B):
    """Return the temporal intersection-over-union of two intervals.

    The denominator is the span from the earliest start to the latest end.
    For overlapping intervals this equals their union; for disjoint intervals
    the numerator is 0, so the result is 0 either way.

    Args:
        A: Indexable ``(start, end)`` pair.
        B: Indexable ``(start, end)`` pair.

    Returns:
        The IoU as a number in ``[0, 1]`` for well-formed intervals, and
        ``0.0`` when the combined span has no positive length (e.g. both
        intervals are the same single point), instead of dividing by zero.
    """
    intersection = max(min(A[1], B[1]) - max(A[0], B[0]), 0)
    span = max(A[1], B[1]) - min(A[0], B[0])
    if span <= 0:
        # Degenerate (zero-length) intervals: no measurable overlap.
        return 0.0
    return intersection / span


def main():
    """Score a prediction file and print R@IoU (0.3 / 0.5 / 0.7) and mIoU.

    The file passed with ``-f`` must be a JSON object whose keys have the form
    ``"<vid>>>><caption>>>><gt>"`` and whose values are the predicted text.
    A prediction from which no single time span can be extracted is printed
    and scored with an interval placed after the ground truth, i.e. IoU 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", default="your_result.json")
    args = parser.parse_args()

    datas = read_json(args.f)
    num = len(datas)
    if num == 0:
        # Every metric below divides by num; fail with a clear message instead.
        raise SystemExit("No predictions found in {}".format(args.f))

    # miou
    ious = []
    for key, pred in datas.items():
        # Split from both ends so a caption that itself contains ">>>" still parses.
        vid, rest = key.split(">>>", 1)
        caption, gt = rest.rsplit(">>>", 1)
        # NOTE(review): eval() executes arbitrary code taken from the result file.
        # Only run this on files you produced yourself; if the ground truth is
        # always a plain literal such as "[1.0, 2.5]", prefer ast.literal_eval.
        gt = eval(gt)
        timestamps = extract_time(pred)
        if len(timestamps) != 1:
            print(f"pred={pred},timestamps={timestamps}")
            # Placeholder interval that lies strictly after the ground truth.
            timestamps = [[gt[1] + 10, gt[1] + 20]]

        ious.append(iou(gt, timestamps[0]))

    # Number of samples reaching each IoU threshold.
    Result = {
        threshold: sum(1 for cur_iou in ious if cur_iou >= threshold)
        for threshold in (0.3, 0.5, 0.7)
    }

    print(
        "IOU 0.3: {0}\nIOU 0.5: {1}\nIOU 0.7: {2}\nmIOU".format(
            Result[0.3] * 100 / num, Result[0.5] * 100 / num, Result[0.7] * 100 / num
        ),
        sum(ious) * 100 / num,
    )


if __name__ == "__main__":
    main()