Spaces:

BiasLab2025
/

perception

Paused

App Files Community

Zhen Ye committed 14 days ago

Commit

29e1c2b

1 Parent(s): 48d3b52

Remove InternVL2 logic as requested

Browse files

Files changed (5) hide show

LaserPerception/LaserPerception.html +1 -3
LaserPerception/LaserPerception.js +1 -1
models/detectors/internvl2.py +0 -188
models/model_loader.py +2 -2
requirements.txt +1 -1

LaserPerception/LaserPerception.html CHANGED Viewed

@@ -81,9 +81,7 @@
                 <optgroup label="Drone Detection Models">
                   <option value="drone_yolo" data-kind="drone">Drone</option>
                 </optgroup>
-                <optgroup label="Vision-Language Models">
-                  <option value="internvl2_military" data-kind="object">InternVL2 (Military)</option>
-                </optgroup>
               </select>
             </div>
             <div>

                 <optgroup label="Drone Detection Models">
                   <option value="drone_yolo" data-kind="drone">Drone</option>
                 </optgroup>
               </select>
             </div>
             <div>

LaserPerception/LaserPerception.js CHANGED Viewed

@@ -703,7 +703,7 @@
         "grounding_dino",
         "sam3",
         "drone_yolo",
-        "internvl2_military"
     ]);
     // Backend currently requires latitude/longitude form fields. We send neutral defaults (no UI, no location in outputs).

         "grounding_dino",
         "sam3",
         "drone_yolo",
     ]);
     // Backend currently requires latitude/longitude form fields. We send neutral defaults (no UI, no location in outputs).

models/detectors/internvl2.py DELETED Viewed

@@ -1,188 +0,0 @@
-from typing import Sequence, List
-import logging
-import torch
-import numpy as np
-import re
-from PIL import Image
-from transformers import AutoModel, AutoTokenizer
-from models.detectors.base import ObjectDetector, DetectionResult
-class InternVL2Detector(ObjectDetector):
-    name = "internvl2_military"
-    supports_batch = False # VLM inference is heavy, safer to do 1-by-1
-    def __init__(self, device: str = "cpu"):
-        self.device = device
-        logging.info(f"Loading InternVL2 (Military) on {device}...")
-        try:
-            path = "SherinSaji/internvl2-5-4b-military-object-detection"
-            # Trust remote code is required for InternVL
-            self.model = AutoModel.from_pretrained(
-                path,
-                torch_dtype=torch.float16 if "cuda" in device else torch.float32,
-                low_cpu_mem_usage=True,
-                trust_remote_code=True
-            ).to(self.device).eval()
-            self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
-            logging.info("InternVL2 loaded successfully.")
-        except Exception as e:
-            logging.exception("Failed to load InternVL2 model")
-            raise e
-    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
-        # Convert CV2 BGR to PIL RGB
-        if frame is None:
-             return DetectionResult(np.array([]), [], [])
-        image_pil = Image.fromarray(frame[:, :, ::-1])
-        width, height = image_pil.size
-        # Prepare Prompt
-        # Logic: We want to detect objects requested in queries.
-        # If queries is empty/default, we ask for general military objects?
-        # InternVL detection prompt usually follows a pattern.
-        # Checking general InternVL2 usage, it often supports "<ref>object</ref>" grounding or general description.
-        # However, for this specific fine-tune, let's assume standard VLM detection prompting.
-        # "Please detect {object} in this image."
-        detected_boxes = []
-        detected_scores = []
-        detected_labels = []
-        detected_label_names = []
-        # We can try to query all in one go or loop. VLM context window allows multiple.
-        # Let's try to query for the list.
-        # Construct a prompt.
-        objects_str = ", ".join(queries) if queries else "military objects"
-        prompt = f"Please detect {objects_str} in this image."
-        # InternVL specific input formatting might be required (e.g. pixel_values)
-        # Usage example implies standard .chat() or .generate() usage?
-        # The user provided loading code: `model = AutoModel...`
-        # Usually InternVL has a `.chat()` API if it's the chat model, or we use `build_transform`.
-        try:
-            # Helper to preprocess image
-            # We assume the model class has 'build_transform' or similar from remote code
-            # But since we use AutoModel, we might just call model.chat if it exposes it (typical for InternVL code).
-            # Note: InternVL2 remote code usually adds .chat() to the model instance.
-            pixel_values = None
-            generation_config = dict(
-                num_beams=1,
-                max_new_tokens=1024,
-                do_sample=False,
-            )
-            # The model likely expects the image to be processed.
-            # Let's try the standard pattern if we can't find specific documentation.
-            # Assuming `model.chat(tokenizer, pixel_values, question, generation_config)`
-            # We need to transform the image.
-            # The typical InternVL transform:
-            from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
-            # If the model has a generic `chat` method that takes PIL image directly, simpler.
-            # Many recent HF models do. Let's try passing PIL image if possible or inspect.
-            # But safer to assume we need to prepare it.
-            # Let's try to use the tokenizer/processor if available?
-            # User only loaded Model and Tokenizer.
-            # Let's attempt to use the model's `chat` method which usually handles image preprocessing
-            # if we pass the correct tensor.
-            # WAIT: The snippet `model = AutoModel...` returns the raw modeling code.
-            # If this is OpenGVLab/InternVL-Chat-V1-5 style code:
-            # It usually requires:
-            # pixel_values = load_image(image_file, max_num=6).to(torch.bfloat16).cuda()
-            # response = model.chat(tokenizer, pixel_values, question, generation_config)
-            # Dynamic resize implementation (simplified from official repo)
-            def dynamic_preprocess(image, min_num=1, max_num=6, image_size=444, use_thumbnail=True):
-                orig_width, orig_height = image.size
-                aspect_ratio = orig_width / orig_height
-                # calculate target box logic...
-                # For simplicity in this wrapper, we might just resize to standard 444x444 or similar
-                # if we can't easily import the complex logic.
-                # However, quality depends on it.
-                # Let's check if the model has a helper?
-                pass
-            # Let's try a simpler path: usually the repo provides `build_transform`.
-            # We can't easily import from the remote code module directly unless we know the path.
-            # But `trust_remote_code=True` imports it into the `AutoModel` namespace usually?
-            # Or we just do standard resize.
-            # Fallback: Resize to 448x448 (common VLM input) and Normalize
-            # But InternVL uses specific mechanics.
-            # Alternative: Assume `model` has a `chat` that accepts image tensors?
-            # Let's assume we can get away with a standard transform for now:
-            t = Compose([
-                Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
-                ToTensor(),
-                Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
-            ])
-            input_tensor = t(image_pil).unsqueeze(0).to(self.device).to(self.model.dtype)
-            # The model.chat signature often varies.
-            # Common: model.chat(tokenizer, pixel_values, question, generation_config)
-            response, history = self.model.chat(
-                self.tokenizer,
-                pixel_values=input_tensor,
-                question=prompt,
-                generation_config=generation_config
-            )
-            # Parse response
-            # Expected output format for detection: "bbox: [x1, y1, x2, y2], label" or similar?
-            # OR <ref>object</ref><box>[[x1, y1, x2, y2]]</box>
-            # We need to parse robustly.
-            # Let's assume the response is text describing objects.
-            # "I found a tank at [100, 200, 300, 400]..."
-            # Heuristic regex parsing for coordinates [x1, y1, x2, y2] (common in VLMs)
-            # Normalization? Usually VLMs output [0-1000] int or [0.0-1.0] float.
-            # InternVL often uses [0, 1000].
-            # Regex for [x1, y1, x2, y2] integers
-            pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
-            raw_boxes = re.findall(pattern, response)
-            for b in raw_boxes:
-                # raw 0-1000 coords
-                x1, y1, x2, y2 = map(int, b)
-                # Scale to image
-                abs_x1 = (x1 / 1000.0) * width
-                abs_y1 = (y1 / 1000.0) * height
-                abs_x2 = (x2 / 1000.0) * width
-                abs_y2 = (y2 / 1000.0) * height
-                detected_boxes.append([abs_x1, abs_y1, abs_x2, abs_y2])
-                detected_scores.append(0.99) # VLM doesn't always give confidence
-                detected_labels.append(0)
-                detected_label_names.append("object") # Provide generic label if parsing fails to link text
-                # Try to find label before the box?
-                # (Complex parsing omitted for MVP, assumes "object" or user query mapping)
-        except Exception as e:
-            logging.error(f"InternVL2 prediction error: {e}")
-        return DetectionResult(
-            np.array(detected_boxes) if detected_boxes else np.empty((0, 4)),
-            detected_scores,
-            detected_labels,
-            detected_label_names
-        )

models/model_loader.py CHANGED Viewed

@@ -7,7 +7,7 @@ from models.detectors.detr import DetrDetector
 from models.detectors.drone_yolo import DroneYoloDetector
 from models.detectors.grounding_dino import GroundingDinoDetector
 from models.detectors.yolov8 import HuggingFaceYoloV8Detector
-from models.detectors.internvl2 import InternVL2Detector
 DEFAULT_DETECTOR = "hf_yolov8"
@@ -16,7 +16,7 @@ _REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
     "detr_resnet50": DetrDetector,
     "grounding_dino": GroundingDinoDetector,
     "drone_yolo": DroneYoloDetector,
-    "internvl2_military": InternVL2Detector,
 }

 from models.detectors.drone_yolo import DroneYoloDetector
 from models.detectors.grounding_dino import GroundingDinoDetector
 from models.detectors.yolov8 import HuggingFaceYoloV8Detector
 DEFAULT_DETECTOR = "hf_yolov8"
     "detr_resnet50": DetrDetector,
     "grounding_dino": GroundingDinoDetector,
     "drone_yolo": DroneYoloDetector,
 }

requirements.txt CHANGED Viewed

@@ -13,4 +13,4 @@ timm
 ffmpeg-python
 python-dotenv
 einops
-internvl @ git+https://github.com/OpenGVLab/InternVL.git#subdirectory=internvl_chat

 ffmpeg-python
 python-dotenv
 einops