peterproofpath committed on
Commit
89bf21d
·
verified ·
1 Parent(s): 5e5265b

Update requirements.txt

Browse files
Files changed (1) hide show
  1. requirements.txt +12 -560
requirements.txt CHANGED
@@ -1,560 +1,12 @@
1
- """
2
- Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints
3
- Model: nvidia/Eagle2.5-8B
4
-
5
- For ProofPath video assessment - long video understanding with up to 512 frames.
6
- Ideal for full rubric-based video grading in a single call.
7
-
8
- REQUIREMENTS:
9
- 1. Set HF_TOKEN environment variable (model is gated)
10
- 2. Accept license at https://huggingface.co/nvidia/Eagle2.5-8B
11
- """
12
-
13
- from typing import Dict, List, Any, Optional, Union
14
- import torch
15
- import numpy as np
16
- import base64
17
- import io
18
- import tempfile
19
- import os
20
- import re
21
-
22
-
23
class EndpointHandler:
    """Custom Hugging Face Inference Endpoints handler for nvidia/Eagle2.5-8B.

    Supports four request shapes (see ``__call__``): single video, single
    image, multiple images, and ProofPath rubric grading of a video.
    """

    # File extensions that mark a URL as pointing at a video.
    _VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v')

    # Timeout (seconds) handed to ``requests`` so a stalled remote host cannot
    # block the worker forever.
    _DOWNLOAD_TIMEOUT_SECONDS = 60

    def __init__(self, path: str = ""):
        """
        Initialize Eagle 2.5 model for video understanding.

        Args:
            path: Path to the model directory (ignored - we always load from HF hub)
        """
        # IMPORTANT: Eagle 2.5 must be loaded from HF hub, not the repository path
        # The repository only contains handler.py and requirements.txt
        model_id = "nvidia/Eagle2.5-8B"

        # Get HF token from environment for gated model access
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

        # Determine device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The model is loaded through the Qwen2-VL classes, as the original
        # author chose. NOTE(review): confirm this matches the checkpoint's
        # declared architecture.
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
            token=hf_token,
            use_fast=True,  # Eagle2_5_VLImageProcessorFast requires use_fast=True
        )

        # Set padding side for batch processing
        if hasattr(self.processor, 'tokenizer'):
            self.processor.tokenizer.padding_side = "left"

        # NOTE(review): "flash_attention_2" presumably needs the flash-attn
        # package installed on GPU hosts - confirm it is in the image.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
            device_map="auto" if torch.cuda.is_available() else None,
            token=hf_token,
        )

        # With device_map="auto" placement is already done; only the CPU path
        # needs an explicit move.
        if not torch.cuda.is_available():
            self.model = self.model.to(self.device)

        self.model.eval()

        # Default config - Eagle 2.5 supports up to 512 frames
        self.default_max_frames = 256  # Conservative default
        self.max_frames_limit = 512

    # ------------------------------------------------------------------
    # Pure helpers (no model / network access)
    # ------------------------------------------------------------------

    @staticmethod
    def _decode_base64_payload(data: str) -> bytes:
        """Decode a ``data:`` URL or a bare base64 string into raw bytes.

        Raises:
            ValueError: if a ``data:`` URL has no comma separator, or the
                payload is not valid base64 (``binascii.Error`` is a
                ``ValueError`` subclass).
        """
        if data.startswith('data:'):
            _header, separator, encoded = data.partition(',')
            if not separator:
                raise ValueError("Malformed data URL: missing ',' separator.")
            return base64.b64decode(encoded)
        return base64.b64decode(data)

    @staticmethod
    def _select_frame_indices(
        total_frames: int,
        video_fps: float,
        max_frames: int,
        fps: float
    ) -> List[int]:
        """Return evenly spaced frame indices to sample from a video.

        The number of samples is the smallest of ``max_frames``, the number of
        frames implied by sampling at ``fps`` over the video duration, and
        ``total_frames``. When that is not positive (e.g. unknown container
        FPS), it falls back to ``min(max_frames, total_frames)`` and is never
        less than one. An empty list is returned for an empty video.
        """
        if total_frames <= 0:
            return []

        duration = total_frames / video_fps if video_fps > 0 else 0
        target_frames = min(max_frames, int(duration * fps), total_frames)
        if target_frames <= 0:
            target_frames = min(max_frames, total_frames)
        # Guard against max_frames <= 0: np.linspace rejects negative counts
        # and a zero count would silently produce no frames.
        target_frames = max(target_frames, 1)

        return np.linspace(0, total_frames - 1, target_frames, dtype=int).tolist()

    @staticmethod
    def _build_rubric_prompt(rubric: List, output_format: str) -> str:
        """Render the grading prompt for ``rubric`` in the requested format.

        Each rubric item is normally a dict with ``step`` and ``description``
        keys; a missing ``step`` defaults to the item's 1-based position.
        Non-dict items are rendered via ``str()`` as the description.
        """
        rendered_steps = []
        for position, item in enumerate(rubric, start=1):
            if isinstance(item, dict):
                step = item.get('step', position)
                description = item.get('description', '')
            else:
                step, description = position, str(item)
            rendered_steps.append(f"Step {step}: {description}")
        rubric_text = "\n".join(rendered_steps)

        if output_format == "json":
            return (
                "Analyze this video against the following rubric and grade each step.\n"
                "\n"
                "RUBRIC:\n"
                f"{rubric_text}\n"
                "\n"
                "For EACH step, determine:\n"
                "1. Whether it was completed (true/false)\n"
                "2. The approximate timestamp where it occurs (if completed)\n"
                "3. Any issues or partial completion notes\n"
                "\n"
                "Respond ONLY with a JSON array in this exact format:\n"
                "[\n"
                '{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"},\n'
                '{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"},\n'
                "...\n"
                "]"
            )

        return (
            "Analyze this video against the following rubric:\n"
            "\n"
            "RUBRIC:\n"
            f"{rubric_text}\n"
            "\n"
            "For each step, describe whether it was completed, when it occurred, "
            "and any issues observed."
        )

    @staticmethod
    def _extract_json_array(text: str) -> Optional[Any]:
        """Return the first-``[``-to-last-``]`` span of ``text`` parsed as JSON.

        Returns ``None`` when no bracketed span exists or it is not valid JSON,
        so callers can keep the raw text instead.
        """
        import json

        match = re.search(r'\[[\s\S]*\]', text)
        if match is None:
            return None
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            return None

    def _resolve_max_frames(self, params: Dict, default: int) -> int:
        """Read ``max_frames`` from ``params`` and clamp it to ``[1, max_frames_limit]``."""
        requested = params.get("max_frames")
        if requested is None:
            requested = default
        return max(1, min(int(requested), self.max_frames_limit))

    def _is_video(self, inputs: Any, params: Dict) -> bool:
        """Determine if input is video based on params, MIME type or file extension.

        An explicit ``params["input_type"]`` of ``"video"`` or ``"image"``
        always wins. Otherwise ``data:video/...`` URLs are videos, and
        http(s) URLs are videos when the URL path (or any query-string value)
        ends with a known video extension. Non-string inputs are not videos.
        """
        from urllib.parse import parse_qsl, urlsplit

        input_type = params.get("input_type")
        if input_type == "video":
            return True
        if input_type == "image":
            return False

        if not isinstance(inputs, str):
            return False

        lowered = inputs.lower()
        if lowered.startswith('data:'):
            return lowered.startswith('data:video/')

        if lowered.startswith(('http://', 'https://')):
            # Match on suffixes only: a substring test would flag hosts such
            # as "cdn.movies.example.com" because they contain ".mov".
            parts = urlsplit(lowered)
            candidates = [parts.path]
            candidates.extend(value for _key, value in parse_qsl(parts.query))
            return any(c.endswith(self._VIDEO_EXTENSIONS) for c in candidates)

        return lowered.endswith(self._VIDEO_EXTENSIONS)

    # ------------------------------------------------------------------
    # Media loading
    # ------------------------------------------------------------------

    def _write_video_to_tempfile(self, video_data: Any) -> str:
        """Materialize ``video_data`` as a temporary ``.mp4`` file and return its path.

        Accepts an http(s) URL, a ``data:`` URL, a bare base64 string, or raw
        bytes. The caller owns the returned file and must delete it. If the
        download or write fails, the partially written file is removed before
        the exception propagates.

        Raises:
            ValueError: for unsupported input types or undecodable payloads.
            requests.RequestException: on network errors or HTTP error status.
        """
        is_url = False
        payload = b""
        if isinstance(video_data, str):
            is_url = video_data.startswith(('http://', 'https://'))
            if not is_url:
                payload = self._decode_base64_payload(video_data)
        elif isinstance(video_data, (bytes, bytearray)):
            payload = bytes(video_data)
        else:
            raise ValueError(f"Unsupported video input type: {type(video_data)}")

        tmp = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
        try:
            with tmp:
                if is_url:
                    import requests
                    with requests.get(
                        video_data,
                        stream=True,
                        timeout=self._DOWNLOAD_TIMEOUT_SECONDS,
                    ) as response:
                        # Without this an HTTP error page would be saved as "video".
                        response.raise_for_status()
                        for chunk in response.iter_content(chunk_size=8192):
                            tmp.write(chunk)
                else:
                    tmp.write(payload)
        except BaseException:
            # delete=False means nobody else will clean this up.
            if os.path.exists(tmp.name):
                os.unlink(tmp.name)
            raise
        return tmp.name

    def _load_video_frames(
        self,
        video_data: Any,
        max_frames: int = 256,
        fps: float = 2.0
    ) -> tuple:
        """
        Load video frames from various input formats.

        Supports:
        - URL to video file
        - Base64 encoded video (bare or as a data URL)
        - Raw bytes

        Returns:
            ``(frames, metadata)`` where ``frames`` is a list of RGB PIL images
            and ``metadata`` has ``duration``, ``total_frames``,
            ``sampled_frames`` and ``video_fps``.

        Raises:
            ValueError: if the input type is unsupported, the video cannot be
                opened, or no frame can be decoded.
        """
        import cv2
        from PIL import Image

        video_path = self._write_video_to_tempfile(video_data)
        cap = None
        try:
            # Open video with OpenCV
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                raise ValueError("Could not open video: unsupported or corrupt data.")

            video_fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = total_frames / video_fps if video_fps > 0 else 0

            frames = []
            for idx in self._select_frame_indices(total_frames, video_fps, max_frames, fps):
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    # Convert BGR to RGB
                    frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))

            if not frames:
                raise ValueError("No frames could be decoded from the video.")

            return frames, {
                "duration": duration,
                "total_frames": total_frames,
                "sampled_frames": len(frames),
                "video_fps": video_fps
            }
        finally:
            # Release the capture even on failure, then clean up the temp file
            if cap is not None:
                cap.release()
            if os.path.exists(video_path):
                os.unlink(video_path)

    def _load_image(self, image_data: Any):
        """Load a single image from a PIL image, URL, data URL, base64 string or bytes.

        A PIL image is returned unchanged; every other input is decoded and
        converted to RGB.

        Raises:
            ValueError: for unsupported input types or undecodable payloads.
            requests.RequestException: on network errors or HTTP error status.
        """
        from PIL import Image

        if isinstance(image_data, Image.Image):
            return image_data

        if isinstance(image_data, str):
            if image_data.startswith(('http://', 'https://')):
                import requests
                response = requests.get(image_data, timeout=self._DOWNLOAD_TIMEOUT_SECONDS)
                response.raise_for_status()
                # .content is transfer-decoded (gzip etc.), unlike .raw
                image_bytes = response.content
            else:
                image_bytes = self._decode_base64_payload(image_data)
        elif isinstance(image_data, (bytes, bytearray)):
            image_bytes = bytes(image_data)
        else:
            raise ValueError(f"Unsupported image input type: {type(image_data)}")

        return Image.open(io.BytesIO(image_bytes)).convert('RGB')

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------

    def _generate(self, messages: List[Dict[str, Any]], max_new_tokens: int) -> str:
        """Run one greedy generation for ``messages`` and return the new text only."""
        from qwen_vl_utils import process_vision_info

        # Apply chat template
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Process vision info
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # Decode - only the new tokens (strip the prompt prefix from each row)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process video or images with Eagle 2.5.

        Expected input formats:

        1. Video analysis:
        {
            "inputs": <video_url_or_base64>,
            "parameters": {
                "prompt": "Describe what happens in this video.",
                "max_frames": 256,
                "fps": 2.0,
                "max_new_tokens": 2048
            }
        }

        2. Image analysis:
        {
            "inputs": <image_url_or_base64>,
            "parameters": {
                "prompt": "Describe this image.",
                "max_new_tokens": 512
            }
        }

        3. Multi-image analysis:
        {
            "inputs": [<image1>, <image2>, ...],
            "parameters": {
                "prompt": "Compare these images.",
                "max_new_tokens": 1024
            }
        }

        4. ProofPath rubric grading:
        {
            "inputs": <video_url>,
            "parameters": {
                "mode": "rubric",
                "rubric": [
                    {"step": 1, "description": "Click cell B2"},
                    {"step": 2, "description": "Type 123"},
                    {"step": 3, "description": "Press Enter"}
                ],
                "max_frames": 512,
                "output_format": "json"
            }
        }

        The payload may also be passed under a "video", "image" or "images"
        key instead of "inputs"; a payload under "video" is treated as a
        video unless "input_type" says otherwise.

        Returns:
        {
            "generated_text": "...",
            "video_metadata": {...},  # If video input
        }
        On a processing failure, a dict with "error", "error_type" and
        "traceback" keys is returned instead of raising.

        Raises:
            ValueError: if no input is provided.
        """
        inputs = data.get("inputs")
        source_key = "inputs"
        if inputs is None:
            for key in ("video", "image", "images"):
                if data.get(key):
                    inputs, source_key = data[key], key
                    break
        if inputs is None:
            raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.")

        # "parameters" may be present but null; copy so the caller's dict is
        # never mutated.
        params = dict(data.get("parameters") or {})
        if source_key == "video":
            params.setdefault("input_type", "video")

        mode = params.get("mode", "default")
        prompt = params.get("prompt", "Describe this content in detail.")
        max_new_tokens = params.get("max_new_tokens", 2048)

        try:
            if mode == "rubric":
                return self._grade_rubric(inputs, params)
            elif isinstance(inputs, list):
                return self._process_multi_image(inputs, prompt, max_new_tokens)
            elif self._is_video(inputs, params):
                return self._process_video(inputs, prompt, params, max_new_tokens)
            else:
                return self._process_image(inputs, prompt, max_new_tokens)

        except Exception as e:
            # NOTE(review): the traceback is returned to the API client, which
            # exposes internals - confirm this is intended outside debugging.
            import traceback
            return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()}

    def _process_video(
        self,
        video_data: Any,
        prompt: str,
        params: Dict,
        max_new_tokens: int
    ) -> Dict[str, Any]:
        """Process a video input and return the generated text plus video metadata."""
        max_frames = self._resolve_max_frames(params, self.default_max_frames)
        fps = params.get("fps", 2.0)

        # Load video frames
        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        # Build message for Eagle 2.5 / Qwen2-VL format
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        return {
            "generated_text": self._generate(messages, max_new_tokens),
            "video_metadata": video_metadata
        }

    def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process a single image and return the generated text plus image size."""
        image = self._load_image(image_data)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        return {
            "generated_text": self._generate(messages, max_new_tokens),
            "image_size": {"width": image.width, "height": image.height}
        }

    def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process multiple images in one prompt and return the text plus image count."""
        images = [self._load_image(img) for img in images_data]

        # Build content with all images, followed by the text prompt
        content = [{"type": "image", "image": image} for image in images]
        content.append({"type": "text", "text": prompt})

        messages = [{"role": "user", "content": content}]

        return {
            "generated_text": self._generate(messages, max_new_tokens),
            "num_images": len(images)
        }

    def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
        """
        Grade a video against a rubric - ProofPath specific mode.

        Returns a dict with "generated_text", "video_metadata" and "rubric".
        When ``output_format`` is "json" and the model output contains a
        parseable JSON array, it is added under "grading_results"; otherwise
        only the raw text is returned.

        Raises:
            ValueError: if ``params`` has no (or an empty) "rubric".
        """
        rubric = params.get("rubric", [])
        if not rubric:
            raise ValueError("Rubric required for rubric mode")

        max_frames = self._resolve_max_frames(params, self.max_frames_limit)
        fps = params.get("fps", 2.0)
        output_format = params.get("output_format", "json")

        # Load video
        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        prompt = self._build_rubric_prompt(rubric, output_format)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        generated_text = self._generate(messages, params.get("max_new_tokens", 2048))

        result = {
            "generated_text": generated_text,
            "video_metadata": video_metadata,
            "rubric": rubric
        }

        # Try to parse JSON if requested; keep raw text only if parsing fails
        if output_format == "json":
            grading_results = self._extract_json_array(generated_text)
            if grading_results is not None:
                result["grading_results"] = grading_results

        return result
 
1
+ # Eagle 2.5 Inference Endpoint Requirements
2
+ transformers>=4.53.0
3
+ torch>=2.0.0
4
+ qwen-vl-utils>=0.0.8
5
+ opencv-python-headless>=4.8.0
6
+ av>=10.0.0
7
+ decord
8
+ Pillow>=9.0.0
9
+ requests>=2.28.0
10
+ numpy>=1.24.0,<2.0.0
11
+ einops>=0.7.0
12
+ accelerate>=0.25.0