ashleshp commited on
Commit
b6192e4
·
1 Parent(s): ff9769d

Switch to transformers

Browse files
Files changed (3) hide show
  1. requirements.txt +16 -16
  2. src/app.py +4 -41
  3. src/perception/engine.py +83 -124
requirements.txt CHANGED
@@ -1,21 +1,21 @@
1
- # Core AI & Inference
2
- llama-cpp-python>=0.2.82 # The engine for Qwen2-VL
3
- numpy>=1.24.0 # Array manipulation
4
- opencv-python-headless>=4.8.0 # Video processing (headless for server/CLI environments)
 
 
 
5
 
6
- # Utility & CLI
7
- rich>=13.0.0 # Beautiful terminal output
8
- pydantic>=2.0.0 # Data validation and settings management
9
- Pillow>=10.0.0 # Image handling
10
-
11
- # Development & Testing
12
- pytest>=7.0.0 # Testing framework
13
- black>=23.0.0 # Code formatter (for dev use)
14
- huggingface_hub>=0.19.0
15
- langgraph>=0.0.10
16
- langchain>=0.1.0
17
- langchain-core>=0.1.0
18
  streamlit>=1.30.0
 
 
 
 
 
19
  sentence-transformers>=2.2.2
20
  scikit-learn>=1.3.0
21
  decord>=0.6.0
 
 
 
 
1
+ # Core AI (Optimized for HF Spaces)
2
+ transformers>=4.45.0
3
+ accelerate>=0.26.0
4
+ torch>=2.1.0
5
+ numpy>=1.24.0
6
+ opencv-python-headless>=4.8.0
7
+ qwen-vl-utils
8
 
9
+ # Utility & UI
 
 
 
 
 
 
 
 
 
 
 
10
  streamlit>=1.30.0
11
+ rich>=13.0.0
12
+ Pillow>=10.0.0
13
+ huggingface_hub>=0.19.0
14
+
15
+ # Search & Vector
16
  sentence-transformers>=2.2.2
17
  scikit-learn>=1.3.0
18
  decord>=0.6.0
19
+ langgraph>=0.0.10
20
+ langchain>=0.1.0
21
+ langchain-core>=0.1.0
src/app.py CHANGED
@@ -29,53 +29,16 @@ st.set_page_config(
29
 
30
  # --- SYSTEM SETUP ---
31
 
32
def ensure_models_exist():
    """
    Make sure the GGUF weights are present on disk.

    On a first run (or a fresh cloud deploy) the files are fetched from the
    Hugging Face Hub automatically, with Streamlit progress feedback.
    """
    REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
    MODEL_FILENAME = "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
    VISION_ADAPTER_FILENAME = "Qwen2-VL-2B-Instruct-f16-mmproj.gguf"

    models_dir = settings.paths.models_dir
    if not models_dir.exists():
        models_dir.mkdir(parents=True)

    weights_file = models_dir / MODEL_FILENAME
    adapter_file = models_dir / VISION_ADAPTER_FILENAME

    # Nothing to do when both artifacts are already in place.
    if weights_file.exists() and adapter_file.exists():
        return

    with st.spinner("📥 Performing First-Time Setup: Downloading AI Models..."):
        if not weights_file.exists():
            st.toast("Downloading Main Model (1.5GB)...")
            hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, local_dir=models_dir)

        if not adapter_file.exists():
            st.toast("Downloading Vision Adapter...")
            try:
                hf_hub_download(repo_id=REPO_ID, filename=VISION_ADAPTER_FILENAME, local_dir=models_dir)
            except Exception:
                # Best-effort: some quantized repos ship the adapter under a
                # different name; the engine will fail loudly later if absent.
                st.warning("Could not download specific adapter. Trying to proceed...")

    st.success("Models Ready!")
-
63
  @st.cache_resource
64
  def initialize_system():
65
  """
66
- Loads the heavy AI models once and caches them.
67
  """
68
- ensure_models_exist()
69
-
70
- print("🚀 System Startup: Initializing AI Engines...")
71
 
72
- # 1. The Analyst (High Intelligence, GPU)
73
  perception_engine = Qwen2PerceptionEngine()
74
- try:
75
- perception_engine.load_model(settings.paths.model_path)
76
- except Exception as error:
77
- st.error(f"Critical Error Loading AI: {error}")
78
- st.stop()
79
 
80
  # 2. The Scout (Fast Search, CPU)
81
  visual_scout = VisualScout()
 
29
 
30
  # --- SYSTEM SETUP ---
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  @st.cache_resource
33
  def initialize_system():
34
  """
35
+ Loads the native Hugging Face model.
36
  """
37
+ print("🚀 System Startup: Initializing Native Transformers Engine...")
 
 
38
 
39
+ # 1. The Analyst (Native Qwen2-VL)
40
  perception_engine = Qwen2PerceptionEngine()
41
+ # Model will lazy-load on first use or we can trigger it here
 
 
 
 
42
 
43
  # 2. The Scout (Fast Search, CPU)
44
  visual_scout = VisualScout()
src/perception/engine.py CHANGED
@@ -1,156 +1,115 @@
 
 
 
 
1
  import os
2
  from pathlib import Path
3
  from typing import Optional, List, Dict
4
- import base64
5
-
6
- # Third-party imports
7
- from llama_cpp import Llama
8
- from llama_cpp.llama_chat_format import Llava15ChatHandler
9
  import cv2
10
 
11
- # Local imports
12
  from src.interfaces.base import PerceptionEngine
13
- from src.config.settings import settings
14
 
15
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    The vision component of the system.

    Wraps the Qwen2-VL vision-language model served through llama.cpp: owns
    the heavy GGUF weights plus the CLIP 'mmproj' adapter, and converts
    images into the base64 data-URIs the chat API can 'see'.
    """

    def __init__(self):
        # Lazy-loaded: stays None until the first call that needs the model,
        # so no RAM/VRAM is consumed before it is actually used.
        self._vision_language_model: Optional[Llama] = None

    def _find_vision_adapter(self) -> Path:
        """Locate the mmproj (image-encoder -> LLM projector) GGUF file."""
        matches = list(settings.paths.models_dir.glob("*mmproj*.gguf"))
        if matches:
            return matches[0]
        raise FileNotFoundError("Critical: Could not find the vision adapter (mmproj) in models/ directory.")

    def load_model(self, model_file_path: Path) -> None:
        """Load the GGUF weights onto the GPU. Idempotent."""
        if self._vision_language_model is not None:
            return  # Already resident in memory.

        print(f"Loading Qwen2-VL from {model_file_path}...")
        try:
            # The chat handler performs the CLIP-side image preprocessing.
            clip_path = str(self._find_vision_adapter())
            self._vision_language_model = Llama(
                model_path=str(model_file_path),
                chat_handler=Llava15ChatHandler(clip_model_path=clip_path),
                n_ctx=2048,        # context window (text + image tokens)
                n_gpu_layers=-1,   # -1 => offload every layer to the GPU
                n_batch=512,
                verbose=False,     # keep logs clean
            )
            print("✅ Vision Model loaded successfully on GPU.")
        except Exception as error:
            print(f"❌ Failed to load model: {error}")
            raise

    def _convert_image_to_base64(self, local_image_path: str) -> str:
        """Base64-encode an image file for embedding in a data URI."""
        with open(local_image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        return base64.b64encode(raw_bytes).decode('utf-8')

    def analyze_frame(self, frame_path: str, user_prompt: str) -> str:
        """Answer ``user_prompt`` about the single image at ``frame_path``."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        encoded_image = self._convert_image_to_base64(frame_path)
        reply = self._vision_language_model.create_chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
                        },
                        {"type": "text", "text": user_prompt},
                    ],
                }
            ],
            max_tokens=256,   # cap response length to avoid rambling
            temperature=0.3,  # low temperature keeps answers factual
        )
        return reply["choices"][0]["message"]["content"]

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, analysis_prompt: str) -> str:
        """Analyze a time range of the video by sampling its middle frame."""
        capture = cv2.VideoCapture(str(video_path))
        frames_per_second = capture.get(cv2.CAP_PROP_FPS)

        # Seek to the frame at the segment's midpoint.
        midpoint_frame = int(((start_time + end_time) / 2) * frames_per_second)
        capture.set(cv2.CAP_PROP_POS_FRAMES, midpoint_frame)
        grabbed, pixels = capture.read()
        capture.release()

        if not grabbed:
            return "Error: Could not read video frame at this timestamp."

        # The model reads images from disk, so persist a snapshot first.
        snapshot = settings.paths.data_dir / "temp_analysis_frame.jpg"
        if not snapshot.parent.exists():
            snapshot.parent.mkdir(parents=True)
        cv2.imwrite(str(snapshot), pixels)

        return self.analyze_frame(str(snapshot), analysis_prompt)

    def chat(self, chat_history: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None) -> str:
        """Text-only chat turn over an existing message history."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        reply = self._vision_language_model.create_chat_completion(
            messages=chat_history,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences,
        )
        return reply["choices"][0]["message"]["content"]

    def generate_text(self, raw_prompt: str, stop_sequences: Optional[List[str]] = None) -> str:
        """Raw completion — used when strict output-format control is needed."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        reply = self._vision_language_model.create_completion(
            prompt=raw_prompt,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences,
        )
        return reply["choices"][0]["text"]
 
1
+ import torch
2
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
3
+ from qwen_vl_utils import process_vision_info
4
+ from PIL import Image
5
  import os
6
  from pathlib import Path
7
  from typing import Optional, List, Dict
 
 
 
 
 
8
  import cv2
9
 
 
10
  from src.interfaces.base import PerceptionEngine
 
11
 
12
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    Hugging Face Native implementation of Qwen2-VL.
    Optimized for HF Spaces (CPU/GPU) without requiring slow C++ builds.
    """

    def __init__(self):
        # Model/processor are lazy-loaded on first use to keep startup cheap.
        self.model_id = "Qwen/Qwen2-VL-2B-Instruct"
        self.model = None
        self.processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_path: Optional[Path] = None) -> None:
        """Loads the model using Transformers. Idempotent.

        Args:
            model_path: Ignored; kept only for interface compatibility with
                the previous GGUF-based engine. Weights always come from
                ``self.model_id`` (Hub or local cache).
        """
        if self.model is not None:
            return

        print(f"Loading Qwen2-VL via Transformers on {self.device}...")

        # torch_dtype="auto" picks half precision on GPU, full on CPU;
        # device_map="auto" lets accelerate place the weights.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype="auto",
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        print(" Native Vision Model loaded.")

    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """Runs single-image inference with the native transformers pipeline.

        Args:
            frame_path: Path to an image file on disk.
            prompt: Question/instruction about the image.
        Returns:
            The model's decoded answer (prompt tokens stripped).
        """
        if self.model is None:
            self.load_model()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": frame_path},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Render the chat template, then pack pixel tensors for the model.
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        # Generate, then strip the echoed prompt tokens from each sequence.
        generated_ids = self.model.generate(**inputs, max_new_tokens=256)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return output_text

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """Extracts the middle frame of [start_time, end_time] and analyzes it."""
        cap = cv2.VideoCapture(str(video_path))
        # Guard: a bad path previously produced a confusing read failure later.
        if not cap.isOpened():
            cap.release()
            return "Error: Could not read frame."

        fps = cap.get(cv2.CAP_PROP_FPS)
        middle_time = (start_time + end_time) / 2
        frame_id = int(middle_time * fps)

        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
        ret, frame = cap.read()
        cap.release()

        if not ret:
            return "Error: Could not read frame."

        # The vision pipeline reads images from disk, so snapshot the frame.
        temp_path = "temp_segment_frame.jpg"
        cv2.imwrite(temp_path, frame)

        return self.analyze_frame(temp_path, prompt)

    @staticmethod
    def _apply_stop_sequences(text: str, stop: Optional[List[str]]) -> str:
        """Truncate ``text`` at the earliest occurrence of any stop string."""
        if not stop:
            return text
        cut = min((text.find(s) for s in stop if s in text), default=-1)
        return text[:cut] if cut != -1 else text

    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Text-only generation.

        Args:
            prompt: Raw prompt string.
            stop: Optional stop sequences. BUG FIX: previously this argument
                was accepted but silently ignored (the old llama.cpp engine
                honored it); output is now truncated at the first match.
        """
        if self.model is None:
            self.load_model()

        inputs = self.processor(text=[prompt], return_tensors="pt").to(self.device)
        generated_ids = self.model.generate(**inputs, max_new_tokens=512)

        # Trim the input prompt from the output before decoding.
        output_text = self.processor.batch_decode(
            generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
        )[0]
        return self._apply_stop_sequences(output_text, stop)

    def chat(self, messages: List[Dict[str, str]], stop: Optional[List[str]] = None) -> str:
        """Simplified chat: flattens the history into one prompt.

        NOTE(review): this bypasses the model's chat template; roles are
        rendered as plain "role: content" lines. ``stop`` restores the
        stop-sequence support the previous engine's chat() offered
        (optional, so existing callers are unaffected).
        """
        prompt = "\n".join([f"{m['role']}: {m['content']}" for m in messages])
        return self.generate_text(prompt, stop)