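"""Gradio demo for Hate-LLaMA: binary hate-content classification of videos.

A minimal sketch of a typical invocation (the filename inference_demo.py is an
assumption, not part of the source):

    python inference_demo.py --gpu-id 0 --cfg-path eval_configs/video_llama_eval_withaudio_stage3.yaml
"""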
import random
import argparse

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import gradio as gr

import decord
decord.bridge.set_bridge('torch')  # make decord return torch tensors

from video_llama.tasks import *
from video_llama.models import *
from video_llama.runners import *
from video_llama.processors import *
from video_llama.datasets.builders import *
from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import Chat, Conversation, SeparatorStyle


def setup_seeds(config):
    """Seed all RNGs (offset by distributed rank) for reproducible runs.

    Note: this helper is not invoked by the script; call it explicitly if
    deterministic sampling is required.
    """
    seed = config.run_cfg.seed + get_rank()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True


def upload_imgorvideo(video_path, chat_state):
    # The incoming chat_state is discarded: every upload starts a fresh
    # conversation so earlier videos cannot leak into the current prompt.
    chat_state = Conversation(
        system="",
        roles=("Human", "Assistant"),
        messages=[],
        offset=0,
        sep_style=SeparatorStyle.SINGLE,
        sep="###",
    )
    img_list = []
    chat.upload_video(video_path, chat_state, img_list)
    return chat_state, img_list


def gradio_ask(user_message, chat_state):
    # Append the user's question to the conversation.
    chat.ask(user_message, chat_state)
    return chat_state


def gradio_answer(chat_state, img_list, num_beams, temperature):
    # Generate a reply conditioned on the conversation and the encoded video.
    output_text, _ = chat.answer(conv=chat_state,
                                 img_list=img_list,
                                 num_beams=num_beams,
                                 temperature=temperature,
                                 max_new_tokens=300,
                                 max_length=2000)
    return output_text


def infer(video_path):
    print(f'\n\n-----------------------{video_path}----------------------\n\n')
    chat_state, img_list = upload_imgorvideo(video_path, None)  # second arg is ignored
    chat_state = gradio_ask(user_message, chat_state)
    response = gradio_answer(chat_state, img_list, num_beams=1, temperature=1)
    print(f'question: {user_message}')
    print(f'answer: {response}')

    # Map the free-form reply onto a binary label; fall back to the raw
    # response when neither keyword appears.
    if 'Yes' in response:
        return "The video is HATEFUL"
    elif 'No' in response:
        return "The video is NOT HATEFUL"
    else:
        return response


parser = argparse.ArgumentParser(description="Inference Process for Multimodal Hate Content Detection")
parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model; use -1 for CPU.")
parser.add_argument("--cfg-path", default='eval_configs/video_llama_eval_withaudio_stage3.yaml', help="path to configuration file.")
parser.add_argument("--options", nargs="+", help="override some settings in the used config, format: --options xx=xx yy=yy zz=zz")
parser.add_argument('--user_message', type=str, default="Is this hateful? Answer (Yes/No)", help='input user message')

args = parser.parse_args()
cfg = Config(args)
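
# A sketch of overriding config values from the command line via --options; the
# key below is an illustrative placeholder, not taken from this repo's YAML:
#
#     python inference_demo.py --options run.seed=42
#
# (inference_demo.py is the filename assumed in the module docstring above.)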

model_config = cfg.model_cfg
print(f'--------------MODEL CONFIG------------ :\n{model_config}\n\n----------------------------------------------------------------\n\n')

# Resolve the target device: -1 selects the CPU, any other value a CUDA index.
if args.gpu_id == -1:
    device = 'cpu'
else:
    device = 'cuda:{}'.format(args.gpu_id)
print(f'\n\n------------------device == {device}-----------\n\n')

model_config.device_8bit = device
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to(device)
model.eval()

# Reuse the WebVid training-time visual processor to preprocess input frames.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

chat = Chat(model, vis_processor, device=device)
user_message = args.user_message
print('Initialization Finished')

print('Step 2: feed-forward process')
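
# A minimal sketch of programmatic use, bypassing the Gradio UI below; the clip
# path 'samples/example.mp4' is a hypothetical placeholder:
#
#     label = infer('samples/example.mp4')
#     print(label)  # e.g. "The video is NOT HATEFUL"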

description = """
<h1 align="center"> Hate-LLaMA </h1>
<h3 align="center"> An Audio-Visual Language Model for Hate Content Detection </h3>

Hate-LLaMA is a multimodal framework designed to detect hate in videos and classify them as HATE or NON-HATE. It fine-tunes Video-LLaMA (which uses the LLaMA-7b-chat model as its backbone) and analyses both the audio and the visual content of a video to perform the classification. After you upload a video and click Submit, the model returns a short statement indicating whether or not the video contains hate.
"""

article = "Authors: Anisha Bhatnagar, Simran Makariye, Divyanshi Parashar"

demo = gr.Interface(fn=infer, inputs="video", outputs="text", description=description, article=article)
demo.launch(share=True, show_api=False)