ashleshp commited on
Commit
b6192e4
·
1 Parent(s): ff9769d

Switch to transformers

Browse files
Files changed (3) hide show
  1. requirements.txt +16 -16
  2. src/app.py +4 -41
  3. src/perception/engine.py +83 -124
requirements.txt CHANGED
@@ -1,21 +1,21 @@
1
- # Core AI & Inference
2
- llama-cpp-python>=0.2.82 # The engine for Qwen2-VL
3
- numpy>=1.24.0 # Array manipulation
4
- opencv-python-headless>=4.8.0 # Video processing (headless for server/CLI environments)
 
 
 
5
 
6
- # Utility & CLI
7
- rich>=13.0.0 # Beautiful terminal output
8
- pydantic>=2.0.0 # Data validation and settings management
9
- Pillow>=10.0.0 # Image handling
10
-
11
- # Development & Testing
12
- pytest>=7.0.0 # Testing framework
13
- black>=23.0.0 # Code formatter (for dev use)
14
- huggingface_hub>=0.19.0
15
- langgraph>=0.0.10
16
- langchain>=0.1.0
17
- langchain-core>=0.1.0
18
  streamlit>=1.30.0
 
 
 
 
 
19
  sentence-transformers>=2.2.2
20
  scikit-learn>=1.3.0
21
  decord>=0.6.0
 
 
 
 
1
+ # Core AI (Optimized for HF Spaces)
2
+ transformers>=4.45.0
3
+ accelerate>=0.26.0
4
+ torch>=2.1.0
5
+ numpy>=1.24.0
6
+ opencv-python-headless>=4.8.0
7
+ qwen-vl-utils
8
 
9
+ # Utility & UI
 
 
 
 
 
 
 
 
 
 
 
10
  streamlit>=1.30.0
11
+ rich>=13.0.0
12
+ Pillow>=10.0.0
13
+ huggingface_hub>=0.19.0
14
+
15
+ # Search & Vector
16
  sentence-transformers>=2.2.2
17
  scikit-learn>=1.3.0
18
  decord>=0.6.0
19
+ langgraph>=0.0.10
20
+ langchain>=0.1.0
21
+ langchain-core>=0.1.0
src/app.py CHANGED
@@ -29,53 +29,16 @@ st.set_page_config(
29
 
30
  # --- SYSTEM SETUP ---
31
 
32
def ensure_models_exist():
    """
    Make sure the GGUF weights are present on disk.

    On a first run (or a fresh cloud deploy) the files are fetched from the
    Hugging Face Hub automatically, with Streamlit progress feedback.
    """
    REPO_ID = "bartowski/Qwen2-VL-2B-Instruct-GGUF"
    MODEL_FILENAME = "Qwen2-VL-2B-Instruct-Q4_K_M.gguf"
    VISION_ADAPTER_FILENAME = "Qwen2-VL-2B-Instruct-f16-mmproj.gguf"

    models_dir = settings.paths.models_dir
    if not models_dir.exists():
        models_dir.mkdir(parents=True)

    weights_file = models_dir / MODEL_FILENAME
    adapter_file = models_dir / VISION_ADAPTER_FILENAME

    # Nothing to do when both artifacts are already in place.
    if weights_file.exists() and adapter_file.exists():
        return

    with st.spinner("📥 Performing First-Time Setup: Downloading AI Models..."):
        if not weights_file.exists():
            st.toast("Downloading Main Model (1.5GB)...")
            hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, local_dir=models_dir)

        if not adapter_file.exists():
            st.toast("Downloading Vision Adapter...")
            try:
                hf_hub_download(repo_id=REPO_ID, filename=VISION_ADAPTER_FILENAME, local_dir=models_dir)
            except Exception:
                # Best-effort: some quantized repos ship the adapter under a
                # different name; the engine will fail loudly later if absent.
                st.warning("Could not download specific adapter. Trying to proceed...")

    st.success("Models Ready!")
-
63
  @st.cache_resource
64
  def initialize_system():
65
  """
66
- Loads the heavy AI models once and caches them.
67
  """
68
- ensure_models_exist()
69
-
70
- print("🚀 System Startup: Initializing AI Engines...")
71
 
72
- # 1. The Analyst (High Intelligence, GPU)
73
  perception_engine = Qwen2PerceptionEngine()
74
- try:
75
- perception_engine.load_model(settings.paths.model_path)
76
- except Exception as error:
77
- st.error(f"Critical Error Loading AI: {error}")
78
- st.stop()
79
 
80
  # 2. The Scout (Fast Search, CPU)
81
  visual_scout = VisualScout()
 
29
 
30
  # --- SYSTEM SETUP ---
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  @st.cache_resource
33
  def initialize_system():
34
  """
35
+ Loads the native Hugging Face model.
36
  """
37
+ print("🚀 System Startup: Initializing Native Transformers Engine...")
 
 
38
 
39
+ # 1. The Analyst (Native Qwen2-VL)
40
  perception_engine = Qwen2PerceptionEngine()
41
+ # Model will lazy-load on first use or we can trigger it here
 
 
 
 
42
 
43
  # 2. The Scout (Fast Search, CPU)
44
  visual_scout = VisualScout()
src/perception/engine.py CHANGED
@@ -1,156 +1,115 @@
 
 
 
 
1
  import os
2
  from pathlib import Path
3
  from typing import Optional, List, Dict
4
- import base64
5
-
6
- # Third-party imports
7
- from llama_cpp import Llama
8
- from llama_cpp.llama_chat_format import Llava15ChatHandler
9
  import cv2
10
 
11
- # Local imports
12
  from src.interfaces.base import PerceptionEngine
13
- from src.config.settings import settings
14
 
15
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    The vision component of the system.

    Wraps the Qwen2-VL vision-language model served through llama.cpp: owns
    the heavy GGUF weights plus the CLIP 'mmproj' adapter, and converts
    images into the base64 data-URIs the chat API can 'see'.
    """

    def __init__(self):
        # Lazy-loaded: stays None until the first call that needs the model,
        # so no RAM/VRAM is consumed before it is actually used.
        self._vision_language_model: Optional[Llama] = None

    def _find_vision_adapter(self) -> Path:
        """Locate the mmproj (image-encoder -> LLM projector) GGUF file."""
        matches = list(settings.paths.models_dir.glob("*mmproj*.gguf"))
        if matches:
            return matches[0]
        raise FileNotFoundError("Critical: Could not find the vision adapter (mmproj) in models/ directory.")

    def load_model(self, model_file_path: Path) -> None:
        """Load the GGUF weights onto the GPU. Idempotent."""
        if self._vision_language_model is not None:
            return  # Already resident in memory.

        print(f"Loading Qwen2-VL from {model_file_path}...")
        try:
            # The chat handler performs the CLIP-side image preprocessing.
            clip_path = str(self._find_vision_adapter())
            self._vision_language_model = Llama(
                model_path=str(model_file_path),
                chat_handler=Llava15ChatHandler(clip_model_path=clip_path),
                n_ctx=2048,        # context window (text + image tokens)
                n_gpu_layers=-1,   # -1 => offload every layer to the GPU
                n_batch=512,
                verbose=False,     # keep logs clean
            )
            print("✅ Vision Model loaded successfully on GPU.")
        except Exception as error:
            print(f"❌ Failed to load model: {error}")
            raise

    def _convert_image_to_base64(self, local_image_path: str) -> str:
        """Base64-encode an image file for embedding in a data URI."""
        with open(local_image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        return base64.b64encode(raw_bytes).decode('utf-8')

    def analyze_frame(self, frame_path: str, user_prompt: str) -> str:
        """Answer ``user_prompt`` about the single image at ``frame_path``."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        encoded_image = self._convert_image_to_base64(frame_path)
        reply = self._vision_language_model.create_chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
                        },
                        {"type": "text", "text": user_prompt},
                    ],
                }
            ],
            max_tokens=256,   # cap response length to avoid rambling
            temperature=0.3,  # low temperature keeps answers factual
        )
        return reply["choices"][0]["message"]["content"]

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, analysis_prompt: str) -> str:
        """Analyze a time range of the video by sampling its middle frame."""
        capture = cv2.VideoCapture(str(video_path))
        frames_per_second = capture.get(cv2.CAP_PROP_FPS)

        # Seek to the frame at the segment's midpoint.
        midpoint_frame = int(((start_time + end_time) / 2) * frames_per_second)
        capture.set(cv2.CAP_PROP_POS_FRAMES, midpoint_frame)
        grabbed, pixels = capture.read()
        capture.release()

        if not grabbed:
            return "Error: Could not read video frame at this timestamp."

        # The model reads images from disk, so persist a snapshot first.
        snapshot = settings.paths.data_dir / "temp_analysis_frame.jpg"
        if not snapshot.parent.exists():
            snapshot.parent.mkdir(parents=True)
        cv2.imwrite(str(snapshot), pixels)

        return self.analyze_frame(str(snapshot), analysis_prompt)

    def chat(self, chat_history: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None) -> str:
        """Text-only chat turn over an existing message history."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        reply = self._vision_language_model.create_chat_completion(
            messages=chat_history,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences,
        )
        return reply["choices"][0]["message"]["content"]

    def generate_text(self, raw_prompt: str, stop_sequences: Optional[List[str]] = None) -> str:
        """Raw completion — used when strict output-format control is needed."""
        if self._vision_language_model is None:
            self.load_model(settings.paths.model_path)

        reply = self._vision_language_model.create_completion(
            prompt=raw_prompt,
            max_tokens=512,
            temperature=0.7,
            stop=stop_sequences,
        )
        return reply["choices"][0]["text"]
 
1
+ import torch
2
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
3
+ from qwen_vl_utils import process_vision_info
4
+ from PIL import Image
5
  import os
6
  from pathlib import Path
7
  from typing import Optional, List, Dict
 
 
 
 
 
8
  import cv2
9
 
 
10
  from src.interfaces.base import PerceptionEngine
 
11
 
12
class Qwen2PerceptionEngine(PerceptionEngine):
    """
    Hugging Face Native implementation of Qwen2-VL.
    Optimized for HF Spaces (CPU/GPU) without requiring slow C++ builds.
    """

    def __init__(self):
        # Model/processor are lazy-loaded on first use to keep startup cheap.
        self.model_id = "Qwen/Qwen2-VL-2B-Instruct"
        self.model = None
        self.processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_path: Optional[Path] = None) -> None:
        """Loads the model using Transformers. Idempotent.

        Args:
            model_path: Ignored; kept only for interface compatibility with
                the previous GGUF-based engine. Weights always come from
                ``self.model_id`` (Hub or local cache).
        """
        if self.model is not None:
            return

        print(f"Loading Qwen2-VL via Transformers on {self.device}...")

        # torch_dtype="auto" picks half precision on GPU, full on CPU;
        # device_map="auto" lets accelerate place the weights.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype="auto",
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        print(" Native Vision Model loaded.")

    def analyze_frame(self, frame_path: str, prompt: str) -> str:
        """Runs single-image inference with the native transformers pipeline.

        Args:
            frame_path: Path to an image file on disk.
            prompt: Question/instruction about the image.
        Returns:
            The model's decoded answer (prompt tokens stripped).
        """
        if self.model is None:
            self.load_model()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": frame_path},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Render the chat template, then pack pixel tensors for the model.
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.device)

        # Generate, then strip the echoed prompt tokens from each sequence.
        generated_ids = self.model.generate(**inputs, max_new_tokens=256)
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return output_text

    def analyze_video_segment(self, video_path: Path, start_time: float, end_time: float, prompt: str) -> str:
        """Extracts the middle frame of [start_time, end_time] and analyzes it."""
        cap = cv2.VideoCapture(str(video_path))
        # Guard: a bad path previously produced a confusing read failure later.
        if not cap.isOpened():
            cap.release()
            return "Error: Could not read frame."

        fps = cap.get(cv2.CAP_PROP_FPS)
        middle_time = (start_time + end_time) / 2
        frame_id = int(middle_time * fps)

        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
        ret, frame = cap.read()
        cap.release()

        if not ret:
            return "Error: Could not read frame."

        # The vision pipeline reads images from disk, so snapshot the frame.
        temp_path = "temp_segment_frame.jpg"
        cv2.imwrite(temp_path, frame)

        return self.analyze_frame(temp_path, prompt)

    @staticmethod
    def _apply_stop_sequences(text: str, stop: Optional[List[str]]) -> str:
        """Truncate ``text`` at the earliest occurrence of any stop string."""
        if not stop:
            return text
        cut = min((text.find(s) for s in stop if s in text), default=-1)
        return text[:cut] if cut != -1 else text

    def generate_text(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Text-only generation.

        Args:
            prompt: Raw prompt string.
            stop: Optional stop sequences. BUG FIX: previously this argument
                was accepted but silently ignored (the old llama.cpp engine
                honored it); output is now truncated at the first match.
        """
        if self.model is None:
            self.load_model()

        inputs = self.processor(text=[prompt], return_tensors="pt").to(self.device)
        generated_ids = self.model.generate(**inputs, max_new_tokens=512)

        # Trim the input prompt from the output before decoding.
        output_text = self.processor.batch_decode(
            generated_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
        )[0]
        return self._apply_stop_sequences(output_text, stop)

    def chat(self, messages: List[Dict[str, str]], stop: Optional[List[str]] = None) -> str:
        """Simplified chat: flattens the history into one prompt.

        NOTE(review): this bypasses the model's chat template; roles are
        rendered as plain "role: content" lines. ``stop`` restores the
        stop-sequence support the previous engine's chat() offered
        (optional, so existing callers are unaffected).
        """
        prompt = "\n".join([f"{m['role']}: {m['content']}" for m in messages])
        return self.generate_text(prompt, stop)