# imports
import gradio as gr
import os
import glob
import random
import argparse
import numpy as np
import pandas as pd
import torch
import torch.backends.cudnn as cudnn
import decord
decord.bridge.set_bridge('torch')

from video_llama.tasks import *
from video_llama.models import *
from video_llama.runners import *
from video_llama.processors import *
from video_llama.datasets.builders import *
from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import Chat, Conversation, default_conversation, SeparatorStyle


def setup_seeds(config):
    """Seed all RNGs (offset by rank) for reproducible runs."""
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cudnn.benchmark = False
    cudnn.deterministic = True


def upload_imgorvideo(video_path, chat_state):
    # Start a fresh single-turn conversation for each uploaded video;
    # the incoming chat_state is discarded.
    chat_state = Conversation(
        system="",
        roles=("Human", "Assistant"),
        messages=[],
        offset=0,
        sep_style=SeparatorStyle.SINGLE,
        sep="###",
    )
    img_list = []
    chat.upload_video(video_path, chat_state, img_list)
    return chat_state, img_list


def gradio_ask(user_message, chat_state):
    chat.ask(user_message, chat_state)
    return chat_state


def gradio_answer(chat_state, img_list, num_beams, temperature):
    output_text, _ = chat.answer(conv=chat_state,
                                 img_list=img_list,
                                 num_beams=num_beams,
                                 temperature=temperature,
                                 max_new_tokens=300,
                                 max_length=2000)  # llama: max_token_num=2048
    return output_text


def infer(video_path):
    print(f'\n\n-----------------------{video_path}----------------------\n\n')
    chat_state, img_list = upload_imgorvideo(video_path, None)  # chat_state arg is rebuilt inside
    chat_state = gradio_ask(user_message, chat_state)
    response = gradio_answer(chat_state, img_list, num_beams=1, temperature=1)
    print(f'question: {user_message}')
    print(f'answer: {response}')
    # Map the model's free-form Yes/No answer to a classification label.
    if 'Yes' in response:
        return "The video is HATEFUL"
    elif 'No' in response:
        return "The video is NOT HATEFUL"
    else:
        return response


parser = argparse.ArgumentParser(description="Inference Process for Multimodal Hate Content Detection")
# default config
parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model; -1 for cpu.")
parser.add_argument("--cfg-path", default='eval_configs/video_llama_eval_withaudio_stage3.yaml', help="path to configuration file.")
parser.add_argument("--options", nargs="+", help="override some settings in the used config, format: --options xx=xx yy=yy zz=zz")
# input message
parser.add_argument('--user_message', type=str, default="Is this hateful? \nAnswer (Yes/No)", help='input user message')
Answer (Yes/No)", help='input user message') args = parser.parse_args() cfg = Config(args) model_config = cfg.model_cfg print(f'--------------MODEL CONFIG------------ :\n{model_config}\n\n ----------------------------------------------------------------\n\n') if args.gpu_id == -1: device = 'cpu' else: device='cuda:{}'.format(args.gpu_id) print(f'\n\n------------------device == {device}-----------\n\n') model_config.device_8bit = device model_cls = registry.get_model_class(model_config.arch) model = model_cls.from_config(model_config).to(device) model.eval() vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) chat = Chat(model, vis_processor, device=device) user_message = args.user_message print('Initialization Finished') print ('Step2: feed-forward process') #title = "Hate-LLaMA - An Instruction-tuned Audio-Visual Language Model for Hate Content Detection" description = """

Hate-LLaMA

An Audio-Visual Language Model for Hate Content Detection

Hate-LLaMA is a multimodal framework designed to detect hate in videos and classify them as HATE or NON-HATE. Hate-LLaMA fine-tunes Video-LLaMA (which uses the LLaMA-7b-chat model as its backbone), and it analyses both the audio and visual content to perform the classification task. After uploading a video and clicking submit, the model outputs a simple statement identifying whether the video contains hate or not.
"""
article = "Authors: Anisha Bhatnagar, Simran Makariye, Divyanshi Parashar"
#examples = ["examples/hate_video_136.mp4", "examples/hate_video_2.mp4", "examples/non_hate_video_349.mp4", "examples/non_hate_video_569.mp4"]

demo = gr.Interface(fn=infer, inputs="video", outputs="text", description=description, article=article)  #, examples=examples)
demo.launch(share=True, show_api=False)
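
# Example invocation (a sketch; the script filename `inference.py` is an
# assumption, not confirmed by this repo):
#
#   python inference.py --gpu-id 0 \
#       --cfg-path eval_configs/video_llama_eval_withaudio_stage3.yaml
#
# This launches the Gradio demo with a public share link. Uploading a clip
# (e.g. one of the commented-out examples such as examples/hate_video_2.mp4)
# and clicking Submit runs infer(), which returns "The video is HATEFUL" or
# "The video is NOT HATEFUL" depending on the model's Yes/No answer.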