Spaces:
Paused
Paused
Zhen Ye
commited on
Commit
·
29e1c2b
1
Parent(s):
48d3b52
Remove InternVL2 logic as requested
Browse files- LaserPerception/LaserPerception.html +1 -3
- LaserPerception/LaserPerception.js +1 -1
- models/detectors/internvl2.py +0 -188
- models/model_loader.py +2 -2
- requirements.txt +1 -1
LaserPerception/LaserPerception.html
CHANGED
|
@@ -81,9 +81,7 @@
|
|
| 81 |
<optgroup label="Drone Detection Models">
|
| 82 |
<option value="drone_yolo" data-kind="drone">Drone</option>
|
| 83 |
</optgroup>
|
| 84 |
-
|
| 85 |
-
<option value="internvl2_military" data-kind="object">InternVL2 (Military)</option>
|
| 86 |
-
</optgroup>
|
| 87 |
</select>
|
| 88 |
</div>
|
| 89 |
<div>
|
|
|
|
| 81 |
<optgroup label="Drone Detection Models">
|
| 82 |
<option value="drone_yolo" data-kind="drone">Drone</option>
|
| 83 |
</optgroup>
|
| 84 |
+
|
|
|
|
|
|
|
| 85 |
</select>
|
| 86 |
</div>
|
| 87 |
<div>
|
LaserPerception/LaserPerception.js
CHANGED
|
@@ -703,7 +703,7 @@
|
|
| 703 |
"grounding_dino",
|
| 704 |
"sam3",
|
| 705 |
"drone_yolo",
|
| 706 |
-
|
| 707 |
]);
|
| 708 |
|
| 709 |
// Backend currently requires latitude/longitude form fields. We send neutral defaults (no UI, no location in outputs).
|
|
|
|
| 703 |
"grounding_dino",
|
| 704 |
"sam3",
|
| 705 |
"drone_yolo",
|
| 706 |
+
|
| 707 |
]);
|
| 708 |
|
| 709 |
// Backend currently requires latitude/longitude form fields. We send neutral defaults (no UI, no location in outputs).
|
models/detectors/internvl2.py
DELETED
|
@@ -1,188 +0,0 @@
|
|
| 1 |
-
from typing import Sequence, List
|
| 2 |
-
import logging
|
| 3 |
-
import torch
|
| 4 |
-
import numpy as np
|
| 5 |
-
import re
|
| 6 |
-
from PIL import Image
|
| 7 |
-
from transformers import AutoModel, AutoTokenizer
|
| 8 |
-
|
| 9 |
-
from models.detectors.base import ObjectDetector, DetectionResult
|
| 10 |
-
|
| 11 |
-
class InternVL2Detector(ObjectDetector):
|
| 12 |
-
name = "internvl2_military"
|
| 13 |
-
supports_batch = False # VLM inference is heavy, safer to do 1-by-1
|
| 14 |
-
|
| 15 |
-
def __init__(self, device: str = "cpu"):
|
| 16 |
-
self.device = device
|
| 17 |
-
logging.info(f"Loading InternVL2 (Military) on {device}...")
|
| 18 |
-
|
| 19 |
-
try:
|
| 20 |
-
path = "SherinSaji/internvl2-5-4b-military-object-detection"
|
| 21 |
-
# Trust remote code is required for InternVL
|
| 22 |
-
self.model = AutoModel.from_pretrained(
|
| 23 |
-
path,
|
| 24 |
-
torch_dtype=torch.float16 if "cuda" in device else torch.float32,
|
| 25 |
-
low_cpu_mem_usage=True,
|
| 26 |
-
trust_remote_code=True
|
| 27 |
-
).to(self.device).eval()
|
| 28 |
-
|
| 29 |
-
self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
|
| 30 |
-
|
| 31 |
-
logging.info("InternVL2 loaded successfully.")
|
| 32 |
-
except Exception as e:
|
| 33 |
-
logging.exception("Failed to load InternVL2 model")
|
| 34 |
-
raise e
|
| 35 |
-
|
| 36 |
-
def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
|
| 37 |
-
# Convert CV2 BGR to PIL RGB
|
| 38 |
-
if frame is None:
|
| 39 |
-
return DetectionResult(np.array([]), [], [])
|
| 40 |
-
|
| 41 |
-
image_pil = Image.fromarray(frame[:, :, ::-1])
|
| 42 |
-
width, height = image_pil.size
|
| 43 |
-
|
| 44 |
-
# Prepare Prompt
|
| 45 |
-
# Logic: We want to detect objects requested in queries.
|
| 46 |
-
# If queries is empty/default, we ask for general military objects?
|
| 47 |
-
# InternVL detection prompt usually follows a pattern.
|
| 48 |
-
# Checking general InternVL2 usage, it often supports "<ref>object</ref>" grounding or general description.
|
| 49 |
-
# However, for this specific fine-tune, let's assume standard VLM detection prompting.
|
| 50 |
-
# "Please detect {object} in this image."
|
| 51 |
-
|
| 52 |
-
detected_boxes = []
|
| 53 |
-
detected_scores = []
|
| 54 |
-
detected_labels = []
|
| 55 |
-
detected_label_names = []
|
| 56 |
-
|
| 57 |
-
# We can try to query all in one go or loop. VLM context window allows multiple.
|
| 58 |
-
# Let's try to query for the list.
|
| 59 |
-
# Construct a prompt.
|
| 60 |
-
objects_str = ", ".join(queries) if queries else "military objects"
|
| 61 |
-
prompt = f"Please detect {objects_str} in this image."
|
| 62 |
-
|
| 63 |
-
# InternVL specific input formatting might be required (e.g. pixel_values)
|
| 64 |
-
# Usage example implies standard .chat() or .generate() usage?
|
| 65 |
-
# The user provided loading code: `model = AutoModel...`
|
| 66 |
-
# Usually InternVL has a `.chat()` API if it's the chat model, or we use `build_transform`.
|
| 67 |
-
|
| 68 |
-
try:
|
| 69 |
-
# Helper to preprocess image
|
| 70 |
-
# We assume the model class has 'build_transform' or similar from remote code
|
| 71 |
-
# But since we use AutoModel, we might just call model.chat if it exposes it (typical for InternVL code).
|
| 72 |
-
|
| 73 |
-
# Note: InternVL2 remote code usually adds .chat() to the model instance.
|
| 74 |
-
pixel_values = None
|
| 75 |
-
generation_config = dict(
|
| 76 |
-
num_beams=1,
|
| 77 |
-
max_new_tokens=1024,
|
| 78 |
-
do_sample=False,
|
| 79 |
-
)
|
| 80 |
-
|
| 81 |
-
# The model likely expects the image to be processed.
|
| 82 |
-
# Let's try the standard pattern if we can't find specific documentation.
|
| 83 |
-
# Assuming `model.chat(tokenizer, pixel_values, question, generation_config)`
|
| 84 |
-
|
| 85 |
-
# We need to transform the image.
|
| 86 |
-
# The typical InternVL transform:
|
| 87 |
-
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
|
| 88 |
-
|
| 89 |
-
# If the model has a generic `chat` method that takes PIL image directly, simpler.
|
| 90 |
-
# Many recent HF models do. Let's try passing PIL image if possible or inspect.
|
| 91 |
-
# But safer to assume we need to prepare it.
|
| 92 |
-
|
| 93 |
-
# Let's try to use the tokenizer/processor if available?
|
| 94 |
-
# User only loaded Model and Tokenizer.
|
| 95 |
-
|
| 96 |
-
# Let's attempt to use the model's `chat` method which usually handles image preprocessing
|
| 97 |
-
# if we pass the correct tensor.
|
| 98 |
-
|
| 99 |
-
# WAIT: The snippet `model = AutoModel...` returns the raw modeling code.
|
| 100 |
-
# If this is OpenGVLab/InternVL-Chat-V1-5 style code:
|
| 101 |
-
# It usually requires:
|
| 102 |
-
# pixel_values = load_image(image_file, max_num=6).to(torch.bfloat16).cuda()
|
| 103 |
-
# response = model.chat(tokenizer, pixel_values, question, generation_config)
|
| 104 |
-
|
| 105 |
-
# Dynamic resize implementation (simplified from official repo)
|
| 106 |
-
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=444, use_thumbnail=True):
|
| 107 |
-
orig_width, orig_height = image.size
|
| 108 |
-
aspect_ratio = orig_width / orig_height
|
| 109 |
-
|
| 110 |
-
# calculate target box logic...
|
| 111 |
-
# For simplicity in this wrapper, we might just resize to standard 444x444 or similar
|
| 112 |
-
# if we can't easily import the complex logic.
|
| 113 |
-
# However, quality depends on it.
|
| 114 |
-
|
| 115 |
-
# Let's check if the model has a helper?
|
| 116 |
-
pass
|
| 117 |
-
|
| 118 |
-
# Let's try a simpler path: usually the repo provides `build_transform`.
|
| 119 |
-
# We can't easily import from the remote code module directly unless we know the path.
|
| 120 |
-
# But `trust_remote_code=True` imports it into the `AutoModel` namespace usually?
|
| 121 |
-
# Or we just do standard resize.
|
| 122 |
-
|
| 123 |
-
# Fallback: Resize to 448x448 (common VLM input) and Normalize
|
| 124 |
-
# But InternVL uses specific mechanics.
|
| 125 |
-
|
| 126 |
-
# Alternative: Assume `model` has a `chat` that accepts image tensors?
|
| 127 |
-
# Let's assume we can get away with a standard transform for now:
|
| 128 |
-
|
| 129 |
-
t = Compose([
|
| 130 |
-
Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
|
| 131 |
-
ToTensor(),
|
| 132 |
-
Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
|
| 133 |
-
])
|
| 134 |
-
|
| 135 |
-
input_tensor = t(image_pil).unsqueeze(0).to(self.device).to(self.model.dtype)
|
| 136 |
-
|
| 137 |
-
# The model.chat signature often varies.
|
| 138 |
-
# Common: model.chat(tokenizer, pixel_values, question, generation_config)
|
| 139 |
-
response, history = self.model.chat(
|
| 140 |
-
self.tokenizer,
|
| 141 |
-
pixel_values=input_tensor,
|
| 142 |
-
question=prompt,
|
| 143 |
-
generation_config=generation_config
|
| 144 |
-
)
|
| 145 |
-
|
| 146 |
-
# Parse response
|
| 147 |
-
# Expected output format for detection: "bbox: [x1, y1, x2, y2], label" or similar?
|
| 148 |
-
# OR <ref>object</ref><box>[[x1, y1, x2, y2]]</box>
|
| 149 |
-
# We need to parse robustly.
|
| 150 |
-
|
| 151 |
-
# Let's assume the response is text describing objects.
|
| 152 |
-
# "I found a tank at [100, 200, 300, 400]..."
|
| 153 |
-
|
| 154 |
-
# Heuristic regex parsing for coordinates [x1, y1, x2, y2] (common in VLMs)
|
| 155 |
-
# Normalization? Usually VLMs output [0-1000] int or [0.0-1.0] float.
|
| 156 |
-
# InternVL often uses [0, 1000].
|
| 157 |
-
|
| 158 |
-
# Regex for [x1, y1, x2, y2] integers
|
| 159 |
-
pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
|
| 160 |
-
raw_boxes = re.findall(pattern, response)
|
| 161 |
-
|
| 162 |
-
for b in raw_boxes:
|
| 163 |
-
# raw 0-1000 coords
|
| 164 |
-
x1, y1, x2, y2 = map(int, b)
|
| 165 |
-
|
| 166 |
-
# Scale to image
|
| 167 |
-
abs_x1 = (x1 / 1000.0) * width
|
| 168 |
-
abs_y1 = (y1 / 1000.0) * height
|
| 169 |
-
abs_x2 = (x2 / 1000.0) * width
|
| 170 |
-
abs_y2 = (y2 / 1000.0) * height
|
| 171 |
-
|
| 172 |
-
detected_boxes.append([abs_x1, abs_y1, abs_x2, abs_y2])
|
| 173 |
-
detected_scores.append(0.99) # VLM doesn't always give confidence
|
| 174 |
-
detected_labels.append(0)
|
| 175 |
-
detected_label_names.append("object") # Provide generic label if parsing fails to link text
|
| 176 |
-
|
| 177 |
-
# Try to find label before the box?
|
| 178 |
-
# (Complex parsing omitted for MVP, assumes "object" or user query mapping)
|
| 179 |
-
|
| 180 |
-
except Exception as e:
|
| 181 |
-
logging.error(f"InternVL2 prediction error: {e}")
|
| 182 |
-
|
| 183 |
-
return DetectionResult(
|
| 184 |
-
np.array(detected_boxes) if detected_boxes else np.empty((0, 4)),
|
| 185 |
-
detected_scores,
|
| 186 |
-
detected_labels,
|
| 187 |
-
detected_label_names
|
| 188 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/model_loader.py
CHANGED
|
@@ -7,7 +7,7 @@ from models.detectors.detr import DetrDetector
|
|
| 7 |
from models.detectors.drone_yolo import DroneYoloDetector
|
| 8 |
from models.detectors.grounding_dino import GroundingDinoDetector
|
| 9 |
from models.detectors.yolov8 import HuggingFaceYoloV8Detector
|
| 10 |
-
|
| 11 |
|
| 12 |
DEFAULT_DETECTOR = "hf_yolov8"
|
| 13 |
|
|
@@ -16,7 +16,7 @@ _REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
|
|
| 16 |
"detr_resnet50": DetrDetector,
|
| 17 |
"grounding_dino": GroundingDinoDetector,
|
| 18 |
"drone_yolo": DroneYoloDetector,
|
| 19 |
-
|
| 20 |
}
|
| 21 |
|
| 22 |
|
|
|
|
| 7 |
from models.detectors.drone_yolo import DroneYoloDetector
|
| 8 |
from models.detectors.grounding_dino import GroundingDinoDetector
|
| 9 |
from models.detectors.yolov8 import HuggingFaceYoloV8Detector
|
| 10 |
+
|
| 11 |
|
| 12 |
DEFAULT_DETECTOR = "hf_yolov8"
|
| 13 |
|
|
|
|
| 16 |
"detr_resnet50": DetrDetector,
|
| 17 |
"grounding_dino": GroundingDinoDetector,
|
| 18 |
"drone_yolo": DroneYoloDetector,
|
| 19 |
+
|
| 20 |
}
|
| 21 |
|
| 22 |
|
requirements.txt
CHANGED
|
@@ -13,4 +13,4 @@ timm
|
|
| 13 |
ffmpeg-python
|
| 14 |
python-dotenv
|
| 15 |
einops
|
| 16 |
-
|
|
|
|
| 13 |
ffmpeg-python
|
| 14 |
python-dotenv
|
| 15 |
einops
|
| 16 |
+
|