"""Imports and special-token constants for a LISA/LLaVA-style multimodal dataset module.

The constants below define the sentinel token ids/strings used when interleaving
image (and video) placeholders into tokenized conversation prompts.
"""

# --- standard library ---
import os
import pdb
import pickle as pkl
import random
import sys
from collections import defaultdict

# --- third-party ---
import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import (
    AutoImageProcessor,
    AutoModel,
    AutoTokenizer,
    CLIPImageProcessor,
    pipeline,
)

# --- project-local ---
from models.llava import conversation as conversation_lib
from models.segment_anything.utils.transforms import ResizeLongestSide

# cv2.setNumThreads(0)
# from towhee import pipe, ops
# logger = log_agent('audio_recs.log')

# Label value ignored by the loss (matches PyTorch CrossEntropyLoss default).
IGNORE_INDEX = -100
# Placeholder id spliced into input_ids where image embeddings are inserted.
IMAGE_TOKEN_INDEX = -200
# NOTE(review): the token strings below arrived empty/truncated in this copy of
# the file (the angle-bracket literals were stripped by whatever extracted it).
# Values restored from the upstream LLaVA constants — confirm against the
# original repository before relying on them.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_VIDEO_TOKEN = "<video>"