Using Multimodal Large Language Models for False Alarm Reduction in Image-based Fire Detection
Existing vision-based methods suffer from high false alarm rates in urban flame detection. Applying Multimodal Large Language Models (MLLMs) for secondary filtering shows great potential in reducing false alarms, yet they have high inference latency and are prone to reasoning collapse on negative samples without explicit Chain-of-Thought (CoT) guidance. To overcome these challenges, this study proposed Flash-Cascade, the first sub-second MLLM-based firewall to leverage CoT to efficiently filter false alarms. We deconstructed the flame detection process into four logical stages (planning, observation, analysis, and judgment), which informed the design of three switchable reasoning modes (Detailed, Quick, and Rapid) to achieve inference acceleration via CoT compression. We fine-tuned Qwen2-VL-7B-Instruct on a multi-grained instruction dataset via Low-Rank Adaptation. This process internalizes explicit reasoning logic into implicit parameter representations, enabling the model to maintain robust reasoning capability even without explicit CoT guidance. On our newly constructed benchmark incorporating real-world hard negatives, Flash-Cascade achieves an accuracy of 97.79% and an F1-score of 0.9767 in Rapid mode, outperforming the baseline by 61.63 percentage points (pp) and 0.5152, respectively. Furthermore, it outperforms the state-of-the-art object detector DEIMv2 by 14.64 pp in accuracy. The method exhibits exceptional sample efficiency, converging with only 600 samples and 2 epochs, and improves inference speed by 810% over standard CoT. This study will open a door for robust and efficient flame detection in high-interference scenarios.
1. Quick Start
For installation instructions, please refer to OpenGVLab/InternVL2-8B.
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import torch
from PIL import Image
from transformers import (
AutoTokenizer,
AutoImageProcessor,
AutoModelForCausalLM,
)
import os
# swift_common
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
generation_config = dict(max_new_tokens=1024, do_sample=True)
path = '' # gaoqie/InternVl2-8B-fire
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
# use_flash_attn=True,
trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
def traverse_directory(directory):
'''
获取指定文件夹下的所有图像路径
'''
filenames_list = []
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
if os.path.isfile(file_path) and file_path.endswith(('.png', '.jpg', '.jpeg')):
filenames_list.append(file_path)
elif os.path.isdir(file_path):
filenames_list.extend(traverse_directory(file_path)) # 递归调用自身来处理子文件夹
print(filenames_list,flush=True)
return filenames_list
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
def infer(img_path):
question = f'<image>\n图像中是否存在火焰?详细分析。'
question = f'<image>\n图像中是否存在火焰?简单回答。'
question = f'<image>\n图像中是否存在火焰?快速回答。'
# set the max number of tiles in `max_num`
pixel_values = load_image(img_path, max_num=12).to(torch.bfloat16).cuda()
# single-image single-round conversation (单图单轮对话)
output_text = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {output_text}')
if __name__=="__main__":
image_path = ""
infer(image_path)
2. License
This code repository is licensed under the Apache license 2.0.
3. Citation
- Downloads last month
- 37
Model tree for gaoqie/InternVl2-8B-fire
Base model
OpenGVLab/InternVL2-8B
