Zhen Ye committed on
Commit
29e1c2b
·
1 Parent(s): 48d3b52

Remove InternVL2 logic as requested

Browse files
LaserPerception/LaserPerception.html CHANGED
@@ -81,9 +81,7 @@
81
  <optgroup label="Drone Detection Models">
82
  <option value="drone_yolo" data-kind="drone">Drone</option>
83
  </optgroup>
84
- <optgroup label="Vision-Language Models">
85
- <option value="internvl2_military" data-kind="object">InternVL2 (Military)</option>
86
- </optgroup>
87
  </select>
88
  </div>
89
  <div>
 
81
  <optgroup label="Drone Detection Models">
82
  <option value="drone_yolo" data-kind="drone">Drone</option>
83
  </optgroup>
84
+
 
 
85
  </select>
86
  </div>
87
  <div>
LaserPerception/LaserPerception.js CHANGED
@@ -703,7 +703,7 @@
703
  "grounding_dino",
704
  "sam3",
705
  "drone_yolo",
706
- "internvl2_military"
707
  ]);
708
 
709
  // Backend currently requires latitude/longitude form fields. We send neutral defaults (no UI, no location in outputs).
 
703
  "grounding_dino",
704
  "sam3",
705
  "drone_yolo",
706
+
707
  ]);
708
 
709
  // Backend currently requires latitude/longitude form fields. We send neutral defaults (no UI, no location in outputs).
models/detectors/internvl2.py DELETED
@@ -1,188 +0,0 @@
1
- from typing import Sequence, List
2
- import logging
3
- import torch
4
- import numpy as np
5
- import re
6
- from PIL import Image
7
- from transformers import AutoModel, AutoTokenizer
8
-
9
- from models.detectors.base import ObjectDetector, DetectionResult
10
-
11
- class InternVL2Detector(ObjectDetector):
12
- name = "internvl2_military"
13
- supports_batch = False # VLM inference is heavy, safer to do 1-by-1
14
-
15
- def __init__(self, device: str = "cpu"):
16
- self.device = device
17
- logging.info(f"Loading InternVL2 (Military) on {device}...")
18
-
19
- try:
20
- path = "SherinSaji/internvl2-5-4b-military-object-detection"
21
- # Trust remote code is required for InternVL
22
- self.model = AutoModel.from_pretrained(
23
- path,
24
- torch_dtype=torch.float16 if "cuda" in device else torch.float32,
25
- low_cpu_mem_usage=True,
26
- trust_remote_code=True
27
- ).to(self.device).eval()
28
-
29
- self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
30
-
31
- logging.info("InternVL2 loaded successfully.")
32
- except Exception as e:
33
- logging.exception("Failed to load InternVL2 model")
34
- raise e
35
-
36
- def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
37
- # Convert CV2 BGR to PIL RGB
38
- if frame is None:
39
- return DetectionResult(np.array([]), [], [])
40
-
41
- image_pil = Image.fromarray(frame[:, :, ::-1])
42
- width, height = image_pil.size
43
-
44
- # Prepare Prompt
45
- # Logic: We want to detect objects requested in queries.
46
- # If queries is empty/default, we ask for general military objects?
47
- # InternVL detection prompt usually follows a pattern.
48
- # Checking general InternVL2 usage, it often supports "<ref>object</ref>" grounding or general description.
49
- # However, for this specific fine-tune, let's assume standard VLM detection prompting.
50
- # "Please detect {object} in this image."
51
-
52
- detected_boxes = []
53
- detected_scores = []
54
- detected_labels = []
55
- detected_label_names = []
56
-
57
- # We can try to query all in one go or loop. VLM context window allows multiple.
58
- # Let's try to query for the list.
59
- # Construct a prompt.
60
- objects_str = ", ".join(queries) if queries else "military objects"
61
- prompt = f"Please detect {objects_str} in this image."
62
-
63
- # InternVL specific input formatting might be required (e.g. pixel_values)
64
- # Usage example implies standard .chat() or .generate() usage?
65
- # The user provided loading code: `model = AutoModel...`
66
- # Usually InternVL has a `.chat()` API if it's the chat model, or we use `build_transform`.
67
-
68
- try:
69
- # Helper to preprocess image
70
- # We assume the model class has 'build_transform' or similar from remote code
71
- # But since we use AutoModel, we might just call model.chat if it exposes it (typical for InternVL code).
72
-
73
- # Note: InternVL2 remote code usually adds .chat() to the model instance.
74
- pixel_values = None
75
- generation_config = dict(
76
- num_beams=1,
77
- max_new_tokens=1024,
78
- do_sample=False,
79
- )
80
-
81
- # The model likely expects the image to be processed.
82
- # Let's try the standard pattern if we can't find specific documentation.
83
- # Assuming `model.chat(tokenizer, pixel_values, question, generation_config)`
84
-
85
- # We need to transform the image.
86
- # The typical InternVL transform:
87
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
88
-
89
- # If the model has a generic `chat` method that takes PIL image directly, simpler.
90
- # Many recent HF models do. Let's try passing PIL image if possible or inspect.
91
- # But safer to assume we need to prepare it.
92
-
93
- # Let's try to use the tokenizer/processor if available?
94
- # User only loaded Model and Tokenizer.
95
-
96
- # Let's attempt to use the model's `chat` method which usually handles image preprocessing
97
- # if we pass the correct tensor.
98
-
99
- # WAIT: The snippet `model = AutoModel...` returns the raw modeling code.
100
- # If this is OpenGVLab/InternVL-Chat-V1-5 style code:
101
- # It usually requires:
102
- # pixel_values = load_image(image_file, max_num=6).to(torch.bfloat16).cuda()
103
- # response = model.chat(tokenizer, pixel_values, question, generation_config)
104
-
105
- # Dynamic resize implementation (simplified from official repo)
106
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=444, use_thumbnail=True):
107
- orig_width, orig_height = image.size
108
- aspect_ratio = orig_width / orig_height
109
-
110
- # calculate target box logic...
111
- # For simplicity in this wrapper, we might just resize to standard 444x444 or similar
112
- # if we can't easily import the complex logic.
113
- # However, quality depends on it.
114
-
115
- # Let's check if the model has a helper?
116
- pass
117
-
118
- # Let's try a simpler path: usually the repo provides `build_transform`.
119
- # We can't easily import from the remote code module directly unless we know the path.
120
- # But `trust_remote_code=True` imports it into the `AutoModel` namespace usually?
121
- # Or we just do standard resize.
122
-
123
- # Fallback: Resize to 448x448 (common VLM input) and Normalize
124
- # But InternVL uses specific mechanics.
125
-
126
- # Alternative: Assume `model` has a `chat` that accepts image tensors?
127
- # Let's assume we can get away with a standard transform for now:
128
-
129
- t = Compose([
130
- Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
131
- ToTensor(),
132
- Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
133
- ])
134
-
135
- input_tensor = t(image_pil).unsqueeze(0).to(self.device).to(self.model.dtype)
136
-
137
- # The model.chat signature often varies.
138
- # Common: model.chat(tokenizer, pixel_values, question, generation_config)
139
- response, history = self.model.chat(
140
- self.tokenizer,
141
- pixel_values=input_tensor,
142
- question=prompt,
143
- generation_config=generation_config
144
- )
145
-
146
- # Parse response
147
- # Expected output format for detection: "bbox: [x1, y1, x2, y2], label" or similar?
148
- # OR <ref>object</ref><box>[[x1, y1, x2, y2]]</box>
149
- # We need to parse robustly.
150
-
151
- # Let's assume the response is text describing objects.
152
- # "I found a tank at [100, 200, 300, 400]..."
153
-
154
- # Heuristic regex parsing for coordinates [x1, y1, x2, y2] (common in VLMs)
155
- # Normalization? Usually VLMs output [0-1000] int or [0.0-1.0] float.
156
- # InternVL often uses [0, 1000].
157
-
158
- # Regex for [x1, y1, x2, y2] integers
159
- pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
160
- raw_boxes = re.findall(pattern, response)
161
-
162
- for b in raw_boxes:
163
- # raw 0-1000 coords
164
- x1, y1, x2, y2 = map(int, b)
165
-
166
- # Scale to image
167
- abs_x1 = (x1 / 1000.0) * width
168
- abs_y1 = (y1 / 1000.0) * height
169
- abs_x2 = (x2 / 1000.0) * width
170
- abs_y2 = (y2 / 1000.0) * height
171
-
172
- detected_boxes.append([abs_x1, abs_y1, abs_x2, abs_y2])
173
- detected_scores.append(0.99) # VLM doesn't always give confidence
174
- detected_labels.append(0)
175
- detected_label_names.append("object") # Provide generic label if parsing fails to link text
176
-
177
- # Try to find label before the box?
178
- # (Complex parsing omitted for MVP, assumes "object" or user query mapping)
179
-
180
- except Exception as e:
181
- logging.error(f"InternVL2 prediction error: {e}")
182
-
183
- return DetectionResult(
184
- np.array(detected_boxes) if detected_boxes else np.empty((0, 4)),
185
- detected_scores,
186
- detected_labels,
187
- detected_label_names
188
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/model_loader.py CHANGED
@@ -7,7 +7,7 @@ from models.detectors.detr import DetrDetector
7
  from models.detectors.drone_yolo import DroneYoloDetector
8
  from models.detectors.grounding_dino import GroundingDinoDetector
9
  from models.detectors.yolov8 import HuggingFaceYoloV8Detector
10
- from models.detectors.internvl2 import InternVL2Detector
11
 
12
  DEFAULT_DETECTOR = "hf_yolov8"
13
 
@@ -16,7 +16,7 @@ _REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
16
  "detr_resnet50": DetrDetector,
17
  "grounding_dino": GroundingDinoDetector,
18
  "drone_yolo": DroneYoloDetector,
19
- "internvl2_military": InternVL2Detector,
20
  }
21
 
22
 
 
7
  from models.detectors.drone_yolo import DroneYoloDetector
8
  from models.detectors.grounding_dino import GroundingDinoDetector
9
  from models.detectors.yolov8 import HuggingFaceYoloV8Detector
10
+
11
 
12
  DEFAULT_DETECTOR = "hf_yolov8"
13
 
 
16
  "detr_resnet50": DetrDetector,
17
  "grounding_dino": GroundingDinoDetector,
18
  "drone_yolo": DroneYoloDetector,
19
+
20
  }
21
 
22
 
requirements.txt CHANGED
@@ -13,4 +13,4 @@ timm
13
  ffmpeg-python
14
  python-dotenv
15
  einops
16
- internvl @ git+https://github.com/OpenGVLab/InternVL.git#subdirectory=internvl_chat
 
13
  ffmpeg-python
14
  python-dotenv
15
  einops
16
+