peterproofpath committed on
Commit
5e5265b
·
verified ·
1 Parent(s): 14de624

Update requirements.txt

Browse files
Files changed (1) hide show
  1. requirements.txt +554 -19
requirements.txt CHANGED
@@ -1,25 +1,560 @@
1
- # Eagle 2.5 Inference Endpoint Requirements
2
- # CRITICAL: Eagle 2.5 requires transformers >= 4.53.0 for auto_docstring support
 
3
 
4
- # Minimum versions required for Eagle 2.5's custom code
5
- transformers>=4.53.0
6
- torch>=2.0.0
7
 
8
- # CRITICAL: Eagle 2.5 uses Qwen2-VL architecture
9
- qwen-vl-utils>=0.0.8
 
 
10
 
11
- # Video processing
12
- opencv-python-headless>=4.8.0
13
- av>=10.0.0
14
- decord
 
 
 
 
15
 
16
- # Image processing
17
- Pillow>=9.0.0
18
- requests>=2.28.0
19
 
20
- # Standard deps - pin numpy to avoid conflicts
21
- numpy>=1.24.0,<2.0.0
22
- einops>=0.7.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # For device_map="auto" model placement (flash attention itself needs the separate flash-attn package)
25
- accelerate>=0.25.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints
3
+ Model: nvidia/Eagle2.5-8B
4
 
5
+ For ProofPath video assessment - long video understanding with up to 512 frames.
6
+ Ideal for full rubric-based video grading in a single call.
 
7
 
8
+ REQUIREMENTS:
9
+ 1. Set HF_TOKEN environment variable (model is gated)
10
+ 2. Accept license at https://huggingface.co/nvidia/Eagle2.5-8B
11
+ """
12
 
13
+ from typing import Dict, List, Any, Optional, Union
14
+ import torch
15
+ import numpy as np
16
+ import base64
17
+ import io
18
+ import tempfile
19
+ import os
20
+ import re
21
 
 
 
 
22
 
23
class EndpointHandler:
    """Custom Inference Endpoints handler for nvidia/Eagle2.5-8B.

    Accepts a video, a single image, or a list of images together with a text
    prompt and returns the generated text.  A dedicated "rubric" mode grades a
    video against a list of steps (ProofPath).

    Attributes set by ``__init__``:
        device: torch device used when no GPU is available.
        processor: Hugging Face processor (chat template, tokenizer, vision).
        model: the loaded generation model, in eval mode.
        default_max_frames: frame budget used when the request gives none.
        max_frames_limit: hard upper bound on sampled frames.
    """

    # Extensions that mark a string input (typically a URL) as a video.
    _VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v')
    # (connect, read) timeouts in seconds for remote downloads, so a stalled
    # server cannot hang the endpoint worker forever.
    _HTTP_TIMEOUT = (10, 120)
    # Frame budget used by rubric mode when the request does not give one.
    _RUBRIC_DEFAULT_MAX_FRAMES = 512

    def __init__(self, path: str = ""):
        """
        Initialize Eagle 2.5 model for video understanding.

        Args:
            path: Path to the model directory (ignored - we always load from HF hub)
        """
        # IMPORTANT: Eagle 2.5 must be loaded from HF hub, not the repository path
        # The repository only contains handler.py and requirements.txt
        model_id = "nvidia/Eagle2.5-8B"

        # Get HF token from environment for gated model access
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

        # Determine device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Imported lazily so the module can be imported without transformers.
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        # The processor is loaded with use_fast=True because
        # Eagle2_5_VLImageProcessorFast requires it.
        # NOTE(review): an earlier comment here recommended use_fast=False to
        # avoid a broken Eagle2_5_VLVideoProcessorFast class - confirm which
        # setting is right for the pinned transformers version.
        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
            token=hf_token,
            use_fast=True,
        )

        # Set padding side for batch processing
        if hasattr(self.processor, 'tokenizer'):
            self.processor.tokenizer.padding_side = "left"

        has_cuda = torch.cuda.is_available()
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if has_cuda else torch.float32,
            attn_implementation="flash_attention_2" if has_cuda else "sdpa",
            device_map="auto" if has_cuda else None,
            token=hf_token,
        )

        # With device_map=None the weights are not placed automatically.
        if not has_cuda:
            self.model = self.model.to(self.device)

        self.model.eval()

        # Default config - Eagle 2.5 supports up to 512 frames
        self.default_max_frames = 256  # Conservative default
        self.max_frames_limit = 512

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _decode_base64_payload(payload: str) -> bytes:
        """Decode a bare base64 string or a ``data:`` URL into raw bytes.

        Raises:
            ValueError: if a ``data:`` URL has no ',' separating header and body.
        """
        if payload.startswith('data:'):
            _header, separator, payload = payload.partition(',')
            if not separator:
                raise ValueError("Malformed data URL: missing ',' separator")
        return base64.b64decode(payload)

    @staticmethod
    def _write_temp_file(chunks, suffix: str = '.mp4') -> str:
        """Write an iterable of byte chunks to a new temp file and return its path.

        The file is removed again if writing fails, so a broken download does
        not leave an orphaned temp file behind.  On success the caller owns
        the file and must delete it.
        """
        handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        try:
            with handle:
                for chunk in chunks:
                    handle.write(chunk)
        except BaseException:
            # The file is closed by the with-block before we get here.
            if os.path.exists(handle.name):
                os.unlink(handle.name)
            raise
        return handle.name

    @staticmethod
    def _sample_frame_indices(total_frames: int, video_fps: float, max_frames: int, fps: float):
        """Return evenly spaced frame indices to sample from a video.

        The number of samples is the smallest of ``max_frames``, the number of
        frames implied by sampling at ``fps`` over the video duration, and
        ``total_frames``.  When the duration-based count is unusable (unknown
        source fps, very short clip) it falls back to
        ``min(max_frames, total_frames)``.

        Returns:
            A 1-D integer numpy array; empty when there is nothing to sample.
        """
        # Some containers report a zero or negative frame count; np.linspace
        # would raise on a negative sample count.
        if total_frames <= 0 or max_frames <= 0:
            return np.array([], dtype=int)

        duration = total_frames / video_fps if video_fps > 0 else 0
        target_frames = min(max_frames, int(duration * fps), total_frames)
        if target_frames <= 0:
            target_frames = min(max_frames, total_frames)

        return np.linspace(0, total_frames - 1, target_frames, dtype=int)

    @staticmethod
    def _build_rubric_prompt(rubric: List, output_format: str) -> str:
        """Build the grading prompt for rubric mode.

        Each rubric item contributes a ``Step N: description`` line; a missing
        ``step`` falls back to the item's 1-based position.
        """
        rubric_text = "\n".join(
            f"Step {item.get('step', position)}: {item.get('description', '')}"
            for position, item in enumerate(rubric, start=1)
        )

        if output_format == "json":
            lines = [
                "Analyze this video against the following rubric and grade each step.",
                "",
                "RUBRIC:",
                rubric_text,
                "",
                "For EACH step, determine:",
                "1. Whether it was completed (true/false)",
                "2. The approximate timestamp where it occurs (if completed)",
                "3. Any issues or partial completion notes",
                "",
                "Respond ONLY with a JSON array in this exact format:",
                "[",
                '{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"},',
                '{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"},',
                "...",
                "]",
            ]
        else:
            lines = [
                "Analyze this video against the following rubric:",
                "",
                "RUBRIC:",
                rubric_text,
                "",
                "For each step, describe whether it was completed, when it occurred, and any issues observed.",
            ]
        return "\n".join(lines)

    @staticmethod
    def _extract_json_array(text: str) -> Optional[Any]:
        """Parse the outermost ``[...]`` span of ``text`` as JSON.

        Returns:
            The parsed value, or None when no bracketed span exists or it is
            not valid JSON (the caller then keeps only the raw text).
        """
        import json

        # Greedy on purpose: spans from the first '[' to the last ']'.
        json_match = re.search(r'\[[\s\S]*\]', text)
        if not json_match:
            return None
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            return None

    def _resolve_max_frames(self, params: Dict, default: int) -> int:
        """Read ``max_frames`` from params and clamp it to [1, max_frames_limit].

        A missing or null value uses ``default``.
        """
        requested = params.get("max_frames")
        if requested is None:
            requested = default
        return max(1, min(int(requested), self.max_frames_limit))

    # ------------------------------------------------------------------
    # Input loading
    # ------------------------------------------------------------------

    def _materialize_video(self, video_data: Any) -> str:
        """Store the video input in a temp .mp4 file and return its path.

        Raises:
            ValueError: for input that is neither ``str`` nor ``bytes``.
        """
        if isinstance(video_data, bytes):
            return self._write_temp_file([video_data])

        if isinstance(video_data, str):
            if video_data.startswith(('http://', 'https://')):
                # URL - stream the download to a temp file
                import requests
                with requests.get(video_data, stream=True, timeout=self._HTTP_TIMEOUT) as response:
                    # Fail here rather than writing an HTML error page to disk.
                    response.raise_for_status()
                    return self._write_temp_file(response.iter_content(chunk_size=8192))
            # Data URL or bare base64
            return self._write_temp_file([self._decode_base64_payload(video_data)])

        raise ValueError(f"Unsupported video input type: {type(video_data)}")

    def _load_video_frames(
        self,
        video_data: Any,
        max_frames: int = 256,
        fps: float = 2.0
    ) -> tuple:
        """
        Load video frames from various input formats.

        Supports:
        - URL to video file
        - Base64 encoded video (bare or as a data URL)
        - Raw bytes

        Returns:
            (frames, metadata): a list of RGB PIL images and a dict with
            "duration", "total_frames", "sampled_frames" and "video_fps".

        Raises:
            ValueError: for an unsupported input type, a video that cannot be
                opened, or a video from which no frame could be decoded.
        """
        import cv2
        from PIL import Image

        video_path = self._materialize_video(video_data)
        try:
            cap = cv2.VideoCapture(video_path)
            try:
                if not cap.isOpened():
                    raise ValueError(
                        "Could not open video; the data may be corrupt or in an unsupported format"
                    )

                video_fps = cap.get(cv2.CAP_PROP_FPS)
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                duration = total_frames / video_fps if video_fps > 0 else 0

                frames = []
                for idx in self._sample_frame_indices(total_frames, video_fps, max_frames, fps):
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                    ret, frame = cap.read()
                    if ret:
                        # OpenCV decodes to BGR; convert to RGB for PIL.
                        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
            finally:
                # Release the capture even when decoding raises.
                cap.release()

            if not frames:
                raise ValueError("No frames could be decoded from the video")

            return frames, {
                "duration": duration,
                "total_frames": total_frames,
                "sampled_frames": len(frames),
                "video_fps": video_fps
            }
        finally:
            # Clean up temp file
            if os.path.exists(video_path):
                os.unlink(video_path)

    def _load_image(self, image_data: Any):
        """Load a single image from a PIL image, URL, data URL, base64 string or bytes.

        A PIL image is returned unchanged; every other input is decoded and
        converted to RGB.

        Raises:
            ValueError: for an unsupported input type.
        """
        from PIL import Image

        if isinstance(image_data, Image.Image):
            return image_data

        if isinstance(image_data, bytes):
            image_bytes = image_data
        elif isinstance(image_data, str):
            if image_data.startswith(('http://', 'https://')):
                import requests
                response = requests.get(image_data, timeout=self._HTTP_TIMEOUT)
                response.raise_for_status()
                # .content applies content decoding (gzip etc.); .raw does not.
                image_bytes = response.content
            else:
                image_bytes = self._decode_base64_payload(image_data)
        else:
            raise ValueError(f"Unsupported image input type: {type(image_data)}")

        return Image.open(io.BytesIO(image_bytes)).convert('RGB')

    # ------------------------------------------------------------------
    # Request entry point
    # ------------------------------------------------------------------

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process video or images with Eagle 2.5.

        Expected input formats:

        1. Video analysis:
        {
            "inputs": <video_url_or_base64>,
            "parameters": {
                "prompt": "Describe what happens in this video.",
                "max_frames": 256,
                "fps": 2.0,
                "max_new_tokens": 2048
            }
        }

        2. Image analysis:
        {
            "inputs": <image_url_or_base64>,
            "parameters": {
                "prompt": "Describe this image.",
                "max_new_tokens": 512
            }
        }

        3. Multi-image analysis:
        {
            "inputs": [<image1>, <image2>, ...],
            "parameters": {
                "prompt": "Compare these images.",
                "max_new_tokens": 1024
            }
        }

        4. ProofPath rubric grading:
        {
            "inputs": <video_url>,
            "parameters": {
                "mode": "rubric",
                "rubric": [
                    {"step": 1, "description": "Click cell B2"},
                    {"step": 2, "description": "Type 123"},
                    {"step": 3, "description": "Press Enter"}
                ],
                "max_frames": 512,
                "output_format": "json"
            }
        }

        The payload may also be passed under "video", "image" or "images"
        instead of "inputs".  "parameters.input_type" ("video" / "image")
        overrides automatic detection.

        Returns:
        {
            "generated_text": "...",
            "video_metadata": {...},  # If video input
        }
        On a processing failure: {"error", "error_type", "traceback"}.

        Raises:
            ValueError: if the request contains no input at all.
        """
        # Copy so the caller's dict is never mutated; tolerate "parameters": null.
        params = dict(data.get("parameters") or {})

        inputs = data.get("inputs")
        if inputs is None:
            # The key the client chose states the input type, so honour it
            # unless the request sets input_type explicitly.
            if data.get("video"):
                inputs = data["video"]
                params.setdefault("input_type", "video")
            elif data.get("image"):
                inputs = data["image"]
                params.setdefault("input_type", "image")
            else:
                inputs = data.get("images")
        if inputs is None:
            raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.")

        mode = params.get("mode", "default")
        prompt = params.get("prompt", "Describe this content in detail.")
        max_new_tokens = params.get("max_new_tokens", 2048)

        try:
            if mode == "rubric":
                return self._grade_rubric(inputs, params)
            if isinstance(inputs, list):
                return self._process_multi_image(inputs, prompt, max_new_tokens)
            if self._is_video(inputs, params):
                return self._process_video(inputs, prompt, params, max_new_tokens)
            return self._process_image(inputs, prompt, max_new_tokens)

        except Exception as e:
            import traceback
            # NOTE(review): the traceback is returned to the client, which
            # exposes internal paths; kept so the response shape is unchanged.
            return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()}

    def _is_video(self, inputs: Any, params: Dict) -> bool:
        """Determine if input is video based on params, MIME type or file extension."""
        if params.get("input_type") == "video":
            return True
        if params.get("input_type") == "image":
            return False

        if isinstance(inputs, str):
            if inputs.startswith('data:'):
                # A data URL states its MIME type in the header; look only at
                # the prefix to avoid copying a large base64 body.
                return inputs[:128].lower().startswith('data:video/')
            lower = inputs.lower()
            return any(ext in lower for ext in self._VIDEO_EXTENSIONS)

        return False

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------

    def _generate(self, messages: List, max_new_tokens: int) -> str:
        """Run one chat turn through the processor and model; return the new text.

        Shared by all processing modes.  Decoding is greedy (do_sample=False)
        and only the tokens generated after the prompt are decoded.
        """
        from qwen_vl_utils import process_vision_info

        # Apply chat template
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Process vision info
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # generate() returns prompt + completion; drop the prompt part.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

    def _process_video(
        self,
        video_data: Any,
        prompt: str,
        params: Dict,
        max_new_tokens: int
    ) -> Dict[str, Any]:
        """Process a video input; returns generated text plus video metadata."""
        max_frames = self._resolve_max_frames(params, self.default_max_frames)
        fps = params.get("fps", 2.0)

        # Load video frames
        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        # Build message for Eagle 2.5 / Qwen2-VL format
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        return {
            "generated_text": self._generate(messages, max_new_tokens),
            "video_metadata": video_metadata
        }

    def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process a single image; returns generated text plus the image size."""
        image = self._load_image(image_data)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        return {
            "generated_text": self._generate(messages, max_new_tokens),
            "image_size": {"width": image.width, "height": image.height}
        }

    def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process multiple images in one turn; returns generated text plus the image count."""
        images = [self._load_image(img) for img in images_data]

        # All images first, then the prompt text.
        content = [{"type": "image", "image": image} for image in images]
        content.append({"type": "text", "text": prompt})

        messages = [{"role": "user", "content": content}]

        return {
            "generated_text": self._generate(messages, max_new_tokens),
            "num_images": len(images)
        }

    def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
        """
        Grade a video against a rubric - ProofPath specific mode.

        Returns:
            Dict with "generated_text", "video_metadata" and "rubric"; when
            output_format is "json" and the reply contains a parsable JSON
            array, also "grading_results".

        Raises:
            ValueError: if params has no (or an empty) "rubric".
        """
        rubric = params.get("rubric", [])
        if not rubric:
            raise ValueError("Rubric required for rubric mode")

        max_frames = self._resolve_max_frames(params, self._RUBRIC_DEFAULT_MAX_FRAMES)
        fps = params.get("fps", 2.0)
        output_format = params.get("output_format", "json")

        # Load video
        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        prompt = self._build_rubric_prompt(rubric, output_format)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        generated_text = self._generate(messages, params.get("max_new_tokens", 2048))

        result = {
            "generated_text": generated_text,
            "video_metadata": video_metadata,
            "rubric": rubric
        }

        # Best effort: keep only the raw text if JSON parsing fails.
        if output_format == "json":
            grading_results = self._extract_json_array(generated_text)
            if grading_results is not None:
                result["grading_results"] = grading_results

        return result