Using Multimodal Large Language Models for False Alarm Reduction in Image-based Fire Detection
Existing vision-based methods suffer from high false alarm rates in urban flame detection. Applying Multimodal Large Language Models (MLLMs) for secondary filtering shows great potential in reducing false alarms, yet they have high inference latency and are prone to reasoning collapse on negative samples without explicit Chain-of-Thought (CoT) guidance. To overcome these challenges, this study proposes Flash-Cascade, the first sub-second MLLM-based firewall to leverage CoT to efficiently filter false alarms. We deconstructed the flame detection process into four logical stages (planning, observation, analysis, and judgment), which informed the design of three switchable reasoning modes (Detailed, Quick, and Rapid) to achieve inference acceleration via CoT compression. We fine-tuned Qwen2-VL-7B-Instruct on a multi-grained instruction dataset via Low-Rank Adaptation. This process internalizes explicit reasoning logic into implicit parameter representations, enabling the model to maintain robust reasoning capability even without explicit CoT guidance. On our newly constructed benchmark incorporating real-world hard negatives, Flash-Cascade achieves an accuracy of 97.79% and an F1-score of 0.9767 in Rapid mode, outperforming the baseline by 61.63 percentage points (pp) and 0.5152, respectively. Furthermore, it outperforms the state-of-the-art object detector DEIMv2 by 14.64 pp in accuracy. The method exhibits exceptional sample efficiency, converging with only 600 samples and 2 epochs, and improves inference speed by 810% over standard CoT. This study opens the door to robust and efficient flame detection in high-interference scenarios.
1. Quick Start
For installation instructions, please refer to deepseek-ai/deepseek-vl2-tiny.
# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# -*- coding:utf-8 -*-
from argparse import ArgumentParser
import io
import sys
import base64
from PIL import Image
import gradio as gr
import torch
from deepseek_vl2.serve.app_modules.gradio_utils import (
cancel_outputing,
delete_last_conversation,
reset_state,
reset_textbox,
wrap_gen_fn,
)
from deepseek_vl2.serve.app_modules.overwrites import reload_javascript
from deepseek_vl2.serve.app_modules.presets import (
CONCURRENT_COUNT,
MAX_EVENTS,
description,
description_top,
title
)
from deepseek_vl2.serve.app_modules.utils import (
configure_logger,
is_variable_assigned,
strip_stop_words,
parse_ref_bbox,
pil_to_base64,
display_example
)
from deepseek_vl2.serve.inference import (
convert_conversation_to_prompts,
deepseek_generate,
load_model,
)
from deepseek_vl2.models.conversation import SeparatorStyle
# logger = configure_logger()
# MODELS = [
# "DeepSeek-VL2-tiny",
# "DeepSeek-VL2-small",
# "DeepSeek-VL2",
# "deepseek-ai/deepseek-vl2-tiny",
# "deepseek-ai/deepseek-vl2-small",
# "deepseek-ai/deepseek-vl2",
# ]
# Process-wide cache of loaded models, keyed by display name; populated
# lazily by fetch_model() so each model is only loaded once per process.
DEPLOY_MODELS = dict()

# Placeholder token marking where images are spliced into the text prompt.
IMAGE_TOKEN = "<image>"
def fetch_model(model_path, model_name: str, dtype=torch.bfloat16):
    """Return the cached model bundle for *model_name*, loading it on first use.

    Args:
        model_path: Filesystem path or hub identifier handed to ``load_model``.
        model_name: Key under which the loaded bundle is cached in
            ``DEPLOY_MODELS``.
        dtype: Torch dtype used when loading the weights.

    Returns:
        Whatever ``load_model`` returns — presumably a
        (tokenizer, model, processor) bundle; see ``deepkseek_vl2_infer``.
    """
    # Removed the dead `global args, DEPLOY_MODELS` declaration: `args` was
    # never used, and DEPLOY_MODELS is only mutated (never rebound), which
    # needs no `global`. Also collapsed the duplicated cache-retrieval code.
    if model_name not in DEPLOY_MODELS:
        print(f"{model_name} is loading...")
        DEPLOY_MODELS[model_name] = load_model(model_path, dtype=dtype)
        print(f"Load {model_name} successfully...")
    else:
        print(f"{model_name} has been loaded.")
    return DEPLOY_MODELS[model_name]
# ok
def generate_prompt_with_history(
    text, images, history, vl_chat_processor, tokenizer, max_length=2048
):
    """
    Build a conversation from the new user turn plus prior history, popping
    the oldest rounds until the encoded prompt fits within max_length.

    Args:
        text (str): The text prompt; may contain IMAGE_TOKEN placeholders.
        images (list[PIL.Image.Image]): Images attached to this turn.
        history (list): Previous conversation messages.
        vl_chat_processor: Supplies the chat template via new_chat_template().
        tokenizer: Tokenizer used to measure the encoded prompt length.
        max_length (int): Maximum allowed prompt length in tokens.

    Returns:
        The conversation copy (taken BEFORE any truncation) once a prompt
        fits within max_length; None if no fitting prompt could be built.
    """
    global IMAGE_TOKEN
    sft_format = "deepseek"
    user_role_ind = 0
    bot_role_ind = 1

    # Initialize conversation from the processor's chat template.
    conversation = vl_chat_processor.new_chat_template()

    if history:
        conversation.messages = history

    if images is not None and len(images) > 0:
        # Reconcile the number of <image> tags in the text with the number of
        # supplied images: pad missing tags, strip surplus ones.
        num_image_tags = text.count(IMAGE_TOKEN)
        num_images = len(images)

        if num_images > num_image_tags:
            pad_image_tags = num_images - num_image_tags
            image_tokens = "\n".join([IMAGE_TOKEN] * pad_image_tags)

            # append the <image> in a new line after the text prompt
            text = image_tokens + "\n" + text
        elif num_images < num_image_tags:
            remove_image_tags = num_image_tags - num_images
            text = text.replace(IMAGE_TOKEN, "", remove_image_tags)

        # A multimodal turn is represented as a (text, images) tuple.
        text = (text, images)

    conversation.append_message(conversation.roles[user_role_ind], text)
    conversation.append_message(conversation.roles[bot_role_ind], "")

    # Create a copy of the conversation to avoid history truncation in the UI;
    # note the copy keeps ALL rounds — popping below only affects the fit check.
    conversation_copy = conversation.copy()

    rounds = len(conversation.messages) // 2
    for _ in range(rounds):
        current_prompt = get_prompt(conversation)
        current_prompt = (
            current_prompt.replace("</s>", "")
            if sft_format == "deepseek"
            else current_prompt
        )

        if torch.tensor(tokenizer.encode(current_prompt)).size(-1) <= max_length:
            return conversation_copy

        if len(conversation.messages) % 2 != 0:
            # NOTE(review): gr.Error is instantiated but not raised here (and
            # below), so no error actually surfaces in the Gradio UI and the
            # function silently returns None — confirm whether these should
            # be `raise gr.Error(...)`.
            gr.Error("The messages between user and assistant are not paired.")
            return

        try:
            for _ in range(2):  # pop out two messages in a row
                conversation.messages.pop(0)
        except IndexError:
            gr.Error("Input text processing failed, unable to respond in this round.")
            return None

    gr.Error("Prompt could not be generated within max_length limit.")
    return None
# ok
def get_prompt(conv) -> str:
    """Render *conv* into the flat text prompt fed to generation."""
    system_prompt = conv.system_template.format(system_message=conv.system_message)

    # Non-DeepSeek templates know how to render themselves.
    if conv.sep_style != SeparatorStyle.DeepSeek:
        return conv.get_prompt()

    separators = (conv.sep, conv.sep2)
    pieces = [] if system_prompt in ("", None) else [system_prompt + separators[0]]

    for idx, (role, message) in enumerate(conv.messages):
        if not message:
            # Open turn with no content yet (e.g. the pending bot reply).
            pieces.append(role + ":")
            continue
        if type(message) is tuple:  # multimodal turn: (text, images)
            message = message[0]
        # Turns alternate between the two separators.
        pieces.append(f"{role}: {message}{separators[idx % 2]}")

    return "".join(pieces)
def predict(
    text,
    images,
    history,
    top_p,
    temperature,
    repetition_penalty,
    max_length_tokens,
    max_context_length_tokens,
    tokenizer,
    vl_gpt,
    vl_chat_processor,
    chunk_size
):
    """
    Run one multimodal generation round and return the model's reply.

    Args:
        text (str): User text prompt; may contain ``<image>`` placeholders.
        images (list | None): PIL images, or file paths/objects accepted by
            ``PIL.Image.open``.
        history (list): Prior conversation messages.
        top_p (float): Nucleus-sampling threshold.
        temperature (float): Sampling temperature.
        repetition_penalty (float): Repetition penalty for decoding.
        max_length_tokens (int): Maximum number of tokens to generate.
        max_context_length_tokens (int): Maximum prompt length in tokens.
        tokenizer: Tokenizer for the model.
        vl_gpt: The vision-language model.
        vl_chat_processor: Chat/template processor for the model.
        chunk_size (int): Streaming chunk size passed to ``deepseek_generate``.

    Returns:
        str: The generated response with stop words stripped.
    """
    if images is None:
        images = []

    # Normalize every entry to an RGB PIL image.
    pil_images = []
    for img_or_file in images:
        try:
            # BUG FIX: the original tested `isinstance(images, Image.Image)`
            # (the whole list — always False), so already-loaded PIL images
            # were wrongly passed to Image.open(). Test the item instead.
            if isinstance(img_or_file, Image.Image):
                pil_images.append(img_or_file)
            else:
                image = Image.open(img_or_file).convert("RGB")
                pil_images.append(image)
        except Exception as e:
            # Best-effort: skip unreadable images rather than aborting.
            print(f"Error loading image: {e}")

    # NOTE(review): this is None when the prompt cannot fit max_length; the
    # attribute accesses below would then raise AttributeError — confirm
    # whether an explicit error is preferable.
    conversation = generate_prompt_with_history(
        text,
        pil_images,
        history,
        vl_chat_processor,
        tokenizer,
        max_length=max_context_length_tokens,
    )
    all_conv, last_image = convert_conversation_to_prompts(conversation)
    stop_words = conversation.stop_str

    full_response = ""
    with torch.no_grad():  # inference only; no gradients needed
        for x in deepseek_generate(
            conversations=all_conv,
            vl_gpt=vl_gpt,
            vl_chat_processor=vl_chat_processor,
            tokenizer=tokenizer,
            stop_words=stop_words,
            max_length=max_length_tokens,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_p=top_p,
            chunk_size=chunk_size
        ):
            full_response += x

    # BUG FIX: strip once after streaming finishes. The original re-stripped
    # inside the loop and raised NameError on `response` if the generator
    # yielded nothing.
    response = strip_stop_words(full_response, stop_words)
    return response
import os
def deepkseek_vl2_infer(img_path, prompt="图像中是否存在火焰?详细分析。"):
    """Run fire-detection inference on one image with DeepSeek-VL2-tiny.

    NOTE(review): the function name keeps the original "deepkseek" typo so
    existing callers keep working.

    Args:
        img_path (str): Path to the image to analyze.
        prompt (str): Question sent to the model. The default is the
            "Detailed" reasoning mode; other modes from the original comments:
            Quick  -> "图像中是否存在火焰?简单回答。"
            Rapid  -> "图像中是否存在火焰?简单回答快速回答。"

    Returns:
        str: The model's textual answer (also printed to stdout).
    """
    model_path = ""  # e.g. "gaoqie/DeepSeekVL2-Tiny-fire" — fill in before use
    chunk_size = -1  # presumably disables streaming chunking — TODO confirm
    tokenizer, vl_gpt, vl_chat_processor = fetch_model(model_path, "DeepSeek-VL2-tiny")
    output_text = predict(
        text=prompt,
        images=[img_path],
        history=[],
        top_p=0.01,
        temperature=0.000001,  # near-greedy decoding for deterministic answers
        repetition_penalty=1.05,
        max_length_tokens=2048,
        max_context_length_tokens=4096,
        tokenizer=tokenizer,
        vl_gpt=vl_gpt,
        vl_chat_processor=vl_chat_processor,
        chunk_size=chunk_size,
    )
    print(output_text)
    # Return the answer as well (the original returned None), so callers can
    # use the result programmatically; printing behavior is unchanged.
    return output_text
if __name__ == "__main__":
    # Fill in the path of the image to classify before running this script.
    img_path = ""
    deepkseek_vl2_infer(img_path)
2. License
This code repository is licensed under the Apache license 2.0.
3. Citation
- Downloads last month
- 8
Model tree for gaoqie/DeepSeekVL2-Tiny-fire
Base model
deepseek-ai/deepseek-vl2-tiny
