Using Multimodal Large Language Models for False Alarm Reduction in Image-based Fire Detection

https://doi.org/10.21203/rs.3.rs-8847038/v1

Existing vision-based methods suffer from high false alarm rates in urban flame detection. Applying Multimodal Large Language Models (MLLMs) as a secondary filter shows great potential for reducing false alarms, yet such models incur high inference latency and are prone to reasoning collapse on negative samples without explicit Chain-of-Thought (CoT) guidance. To overcome these challenges, this study proposes Flash-Cascade, the first sub-second MLLM-based firewall that leverages CoT to efficiently filter false alarms. We decompose the flame detection process into four logical stages (planning, observation, analysis, and judgment), which inform the design of three switchable reasoning modes (Detailed, Quick, and Rapid) that accelerate inference via CoT compression. We fine-tuned Qwen2-VL-7B-Instruct on a multi-grained instruction dataset via Low-Rank Adaptation. This process internalizes explicit reasoning logic into implicit parameter representations, enabling the model to maintain robust reasoning capability even without explicit CoT guidance. On our newly constructed benchmark incorporating real-world hard negatives, Flash-Cascade achieves an accuracy of 97.79% and an F1-score of 0.9767 in Rapid mode, outperforming the baseline by 61.63 percentage points (pp) and 0.5152, respectively. Furthermore, it outperforms the state-of-the-art object detector DEIMv2 by 14.64 pp in accuracy. The method exhibits exceptional sample efficiency, converging with only 600 samples and 2 epochs, and improves inference speed by 810% over standard CoT. This work opens the door to robust and efficient flame detection in high-interference scenarios.


1. Quick Start

  • Installation

For installation instructions, please refer to OpenGVLab/InternVL2-8B.
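If you only want to run the inference example below, a minimal environment can be sketched as follows. The package list is an assumption inferred from the imports in the example; pinned versions should be taken from the InternVL2 repository.

```shell
# Minimal environment sketch; versions are assumptions, not pinned by this repo.
# InternVL2's remote code may additionally require einops and timm.
pip install torch torchvision transformers pillow
```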

  • Simple Inference Example

import os

import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer



IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
generation_config = dict(max_new_tokens=1024, do_sample=True)

path = 'gaoqie/InternVl2-8B-fire'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    # use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)



def traverse_directory(directory):
    '''
    Collect the paths of all images under the given directory (recursively).
    '''
    filenames_list = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path) and file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
            filenames_list.append(file_path)
        elif os.path.isdir(file_path):
            filenames_list.extend(traverse_directory(file_path))  # recurse into subdirectories
    return filenames_list


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


def infer(img_path, mode='rapid'):
    # The three switchable reasoning modes: Detailed ("详细分析" / detailed
    # analysis), Quick ("简单回答" / brief answer), Rapid ("快速回答" / rapid answer).
    questions = {
        'detailed': '<image>\n图像中是否存在火焰?详细分析。',
        'quick': '<image>\n图像中是否存在火焰?简单回答。',
        'rapid': '<image>\n图像中是否存在火焰?快速回答。',
    }
    question = questions[mode]

    # set the max number of tiles in `max_num`
    pixel_values = load_image(img_path, max_num=12).to(torch.bfloat16).cuda()
    # single-image, single-round conversation
    output_text = model.chat(tokenizer, pixel_values, question, generation_config)
    print(f'User: {question}\nAssistant: {output_text}')


if __name__ == "__main__":
    image_path = ""
    infer(image_path)
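The grid-selection math inside `dynamic_preprocess` can be sanity-checked in isolation. The sketch below re-implements it as a standalone helper (`closest_grid` is an illustrative name, and it omits the area-based tie-break of `find_closest_aspect_ratio`, which does not change the result for the sizes shown):

```python
# Simplified sketch of the tile-grid selection used by dynamic_preprocess.
# The area-based tie-break of find_closest_aspect_ratio is omitted; for the
# resolutions demonstrated here the outcome is the same.

def closest_grid(width, height, min_num=1, max_num=12):
    # Enumerate all (cols, rows) grids whose tile count lies in
    # [min_num, max_num], ordered by tile count, then pick the grid whose
    # aspect ratio is closest to the input image's aspect ratio.
    grids = sorted(
        {(i, j)
         for n in range(min_num, max_num + 1)
         for i in range(1, n + 1)
         for j in range(1, n + 1)
         if min_num <= i * j <= max_num},
        key=lambda g: g[0] * g[1])
    aspect = width / height
    return min(grids, key=lambda g: abs(aspect - g[0] / g[1]))

# A 1344x448 frame (3:1) maps to a 3x1 grid of 448x448 tiles, plus one
# global thumbnail when use_thumbnail=True.
print(closest_grid(1344, 448))  # (3, 1)
print(closest_grid(448, 448))   # (1, 1)
```

This explains why `load_image` returns a variable number of tiles per image: wide surveillance frames are cut into more 448x448 crops than square ones, up to the `max_num` cap.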



2. License

This code repository is licensed under the Apache license 2.0.

3. Citation
