# NOTE: This engine is deployed on Hugging Face Spaces; earlier revisions
# failed there with a runtime error (see Space build logs).
import os
import tempfile
from pathlib import Path
from typing import Optional, List, Dict

import cv2
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

from qwen_vl_utils import process_vision_info

from src.interfaces.base import PerceptionEngine
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    Hugging Face native implementation of Qwen2-VL.

    Runs inference through the ``transformers`` API directly, so it works on
    HF Spaces (CPU or GPU) without requiring slow C++ builds.
    """

    def __init__(self):
        # Model/processor are loaded lazily on first use (see load_model).
        self.model_id = "Qwen/Qwen2-VL-2B-Instruct"
        self.model = None
        self.processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_path: Optional[Path] = None) -> None:
        """Load the model and processor via Transformers (idempotent).

        Args:
            model_path: Unused here; kept for interface compatibility with
                PerceptionEngine implementations that load local weights.
        """
        if self.model is not None:
            return  # already loaded
        print(f"Loading Qwen2-VL via Transformers on {self.device}...")
        # torch_dtype="auto" selects float16/bfloat16 on GPU and float32 on
        # CPU; device_map="auto" lets accelerate place the weights.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype="auto",
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        print("✅ Native Vision Model loaded.")

    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """Run image+text inference on a single frame.

        Args:
            frame_path: Path to the image file to analyze.
            prompt: Text instruction for the model.

        Returns:
            The generated answer with the echoed prompt tokens stripped.
        """
        if self.model is None:
            self.load_model()
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": frame_path},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        # Build the chat-formatted prompt text and the vision tensors.
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)
        generated_ids = self.model.generate(**inputs, max_new_tokens=256)
        # Drop the input tokens so only the newly generated completion remains.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        return output_text

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """Extract the middle frame of [start_time, end_time] and analyze it.

        Args:
            video_path: Path to the video file.
            start_time: Segment start, in seconds.
            end_time: Segment end, in seconds.
            prompt: Text instruction passed through to analyze_frame.

        Returns:
            The model's answer, or an "Error: ..." string if the video or
            frame could not be read.
        """
        cap = cv2.VideoCapture(str(video_path))
        try:
            # Fail fast if the file could not be opened at all.
            if not cap.isOpened():
                return "Error: Could not read frame."
            fps = cap.get(cv2.CAP_PROP_FPS)
            middle_time = (start_time + end_time) / 2
            # Some containers report 0 FPS; fall back to the first frame
            # instead of computing a bogus index.
            frame_id = int(middle_time * fps) if fps > 0 else 0
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
            ret, frame = cap.read()
        finally:
            # Always release the capture, even if decoding raises.
            cap.release()
        if not ret:
            return "Error: Could not read frame."
        # Unique temp file (the old fixed "temp_segment_frame.jpg" name raced
        # under concurrent calls and was never deleted).
        fd, temp_path = tempfile.mkstemp(suffix=".jpg")
        os.close(fd)
        try:
            cv2.imwrite(temp_path, frame)
            return self.analyze_frame(temp_path, prompt)
        finally:
            os.remove(temp_path)

    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Text-only generation (no images).

        Args:
            prompt: Raw prompt text.
            stop: Unused here; accepted for interface compatibility.

        Returns:
            The generated continuation with the prompt tokens stripped.
        """
        if self.model is None:
            self.load_model()
        inputs = self.processor(text=[prompt], return_tensors="pt").to(self.device)
        generated_ids = self.model.generate(**inputs, max_new_tokens=512)
        # Trim the echoed input prompt from the decoded output.
        output_text = self.processor.batch_decode(
            generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
        )[0]
        return output_text

    def chat(self, messages: List[Dict[str, str]]) -> str:
        """Flatten a chat transcript into one prompt and generate a reply."""
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        return self.generate_text(prompt)