| import torch |
| import av |
| import numpy as np |
| import os |
| import requests |
| import tempfile |
| import gc |
| import time |
| import threading |
| import uuid |
| from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration |
|
|
class EndpointHandler:
    """Hugging Face Inference Endpoints handler for Video-LLaVA.

    Accepts a payload of the form::

        {"inputs": "<prompt>", "video": "<video url>",
         "parameters": {...}, "callback_url": "<optional url>"}

    When ``callback_url`` is present the request is processed on a
    background thread and the result is POSTed to that URL (async mode);
    otherwise the result is returned synchronously.
    """

    def __init__(self, path=""):
        """Load the Video-LLaVA model and processor.

        `path` is part of the Inference Endpoints handler contract but is
        unused here: the model id is hard-coded below.
        """
        model_id = "LanguageBind/Video-LLaVA-7B-hf"
        print(f"Loading model: {model_id}...")

        self.processor = VideoLlavaProcessor.from_pretrained(model_id)
        self.model = VideoLlavaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,  # half the memory of fp32; bf16 keeps fp32's range
            device_map="auto",           # place layers on available GPU(s)/CPU automatically
            low_cpu_mem_usage=True
        )
        self.model.eval()
        print("Model loaded successfully.")

    def download_video(self, video_url):
        """Stream `video_url` to a temporary file and return its path.

        The caller owns (and must delete) the returned file. On any
        failure the partial download is removed and an Exception is
        raised with the underlying error message.
        """
        suffix = os.path.splitext(video_url)[1] or '.mp4'
        temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        temp_path = temp_file.name
        temp_file.close()

        try:
            # `with` releases the pooled HTTP connection even on error
            # (fix: the original never closed the streamed response).
            with requests.get(video_url, stream=True, timeout=60) as response:
                response.raise_for_status()

                file_size = int(response.headers.get('content-length', 0))

                # Stream in chunks so large videos never sit fully in memory.
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)

            if file_size == 0:
                # Server sent no content-length; measure what we wrote.
                file_size = os.path.getsize(temp_path)

            print(f"Downloaded video ({file_size/1024/1024:.2f} MB) to {temp_path}")
            return temp_path

        except Exception as e:
            # Never leave a partial download behind.
            if os.path.exists(temp_path):
                os.unlink(temp_path)
            raise Exception(f"Failed to download video: {str(e)}")

    def read_video_pyav(self, container, indices):
        """Decode the frames whose positions appear in `indices`.

        Args:
            container: an open PyAV container.
            indices: iterable of int frame positions (decode order).

        Returns:
            List of HxWx3 RGB numpy arrays, in decode order.

        Raises:
            ValueError: if `indices` is empty or none of the requested
                frames could be decoded.
        """
        # Fix: `i in indices` on a numpy array was an O(k) scan per
        # decoded frame; a set gives O(1) membership. Also guards the
        # empty case, which previously crashed on `indices[0]`.
        wanted = {int(i) for i in indices}
        if not wanted:
            raise ValueError("Video decoding failed: No frames found.")
        end_index = max(wanted)

        frames = []
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i > end_index:
                break  # past the last wanted frame; stop decoding early
            if i in wanted:
                frames.append(frame)

        if not frames:
            raise ValueError("Video decoding failed: No frames found.")

        return [x.to_ndarray(format="rgb24") for x in frames]

    def trigger_webhook(self, url, payload):
        """POST `payload` as JSON to `url`, fire-and-forget.

        Errors are logged but never propagated, so a dead callback
        endpoint cannot fail the main request. No-op when `url` is falsy.
        """
        if not url:
            return

        print(f"Sending webhook to {url}")
        try:
            resp = requests.post(url, json=payload, timeout=5)
            resp.raise_for_status()
            print(f"Webhook success: {resp.status_code}")
        except Exception as e:
            print(f"Webhook failed: {str(e)}")

    def _process_video(self, inputs, video_url, parameters, callback_url=None, request_id=None):
        """Core pipeline: download -> sample frames -> generate -> decode.

        Used by both the sync and async paths. If `callback_url` is set,
        the result (success or error) is also delivered via webhook.

        Returns:
            dict: {"generated_text", "status", "execution_time"} on
            success, or {"error", "status": "failed"} on failure.
        """
        predict_start = time.time()
        print(f"\nStarting prediction at {time.strftime('%H:%M:%S')}")

        container = None
        video_path = None

        try:
            # Sampling / generation knobs with sensible defaults.
            num_frames = parameters.get("num_frames", 10)
            max_new_tokens = parameters.get("max_new_tokens", 500)
            temperature = parameters.get("temperature", 0.1)
            top_p = parameters.get("top_p", 0.9)

            print(f"Prompt: {inputs}")

            video_path = self.download_video(video_url)
            container = av.open(video_path)

            # Some containers don't report a frame count; fall back to
            # counting by decoding once, then rewind.
            total_frames = container.streams.video[0].frames
            if total_frames == 0:
                total_frames = sum(1 for _ in container.decode(video=0))
                container.seek(0)

            if total_frames == 0:
                # Fix: previously this fell through to np.linspace with a
                # negative endpoint, producing bogus frame indices.
                raise ValueError("Video decoding failed: No frames found.")

            frames_to_use = min(total_frames, num_frames)
            print(f"Using {frames_to_use} frames")

            # Evenly spaced frame positions across the whole clip.
            indices = np.linspace(0, total_frames - 1, frames_to_use, dtype=int)
            print(f"Using indices: {indices}")

            clip = self.read_video_pyav(container, indices)
            print(f"Extracted {len(clip)} frames")

            # Video-LLaVA expects the "USER: <video>... ASSISTANT:" chat
            # template; wrap raw prompts, pass pre-formatted ones through.
            if "USER:" in inputs:
                full_prompt = inputs
            else:
                full_prompt = f"USER: <video>{inputs} ASSISTANT:"

            model_inputs = self.processor(
                text=full_prompt,
                videos=clip,
                return_tensors="pt"
            ).to(self.model.device)

            with torch.inference_mode():
                generate_ids = self.model.generate(
                    **model_inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    # Greedy decode when temperature is 0; sampling with
                    # temperature=0 is undefined.
                    do_sample=temperature > 0
                )

            result = self.processor.batch_decode(
                generate_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]

            # Keep only the assistant's reply, not the echoed prompt.
            if "ASSISTANT:" in result:
                final_output = result.split("ASSISTANT:")[-1].strip()
            else:
                final_output = result

            execution_time = f"{time.time() - predict_start:.2f}s"
            print(f"Total prediction time: {execution_time}")

            response_payload = {
                "generated_text": final_output,
                "status": "success",
                "execution_time": execution_time
            }

            if callback_url:
                self.trigger_webhook(callback_url, {
                    "request_id": request_id,
                    "input_prompt": inputs,
                    "video_url": video_url,
                    "result": response_payload
                })

            return response_payload

        except Exception as e:
            import traceback
            traceback.print_exc()
            print(f"Inference failed: {str(e)}")

            error_payload = {"error": str(e), "status": "failed"}

            # Async callers still get a webhook on failure.
            if callback_url:
                self.trigger_webhook(callback_url, {
                    "request_id": request_id,
                    "input_prompt": inputs,
                    "video_url": video_url,
                    "result": error_payload
                })

            return error_payload

        finally:
            # Always release the decoder, the temp file and cached GPU memory.
            if container:
                container.close()
            if video_path and os.path.exists(video_path):
                os.unlink(video_path)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

    def __call__(self, data):
        """Endpoint entry point.

        Args:
            data: request payload with keys "inputs", "video",
                "parameters" and optional "callback_url".

        Returns:
            A one-element list with the result dict (sync mode) or an
            "accepted" dict (async mode); a bare error dict when the
            required "video" key is missing.
        """
        callback_url = data.get("callback_url", None)
        inputs = data.get("inputs", "What is happening in this video?")
        video_url = data.get("video", None)
        parameters = data.get("parameters", {})

        # Correlates async results with their webhook deliveries.
        request_id = str(uuid.uuid4())

        if not video_url:
            return {"error": "Missing 'video' URL.", "status": "failed", "request_id": request_id}

        if callback_url:
            # Async mode: acknowledge immediately, do the work on a
            # daemon thread and deliver the result via webhook.
            print(f"Async mode: request_id={request_id}, will send result to {callback_url}")

            thread = threading.Thread(
                target=self._process_video,
                args=(inputs, video_url, parameters, callback_url, request_id),
                daemon=True
            )
            thread.start()

            return [{
                "request_id": request_id,
                "status": "accepted",
                "message": "Processing started. Result will be sent to callback_url.",
                "callback_url": callback_url
            }]

        else:
            # Sync mode: block until generation finishes.
            result = self._process_video(inputs, video_url, parameters, request_id=request_id)
            result["request_id"] = request_id
            return [result]
|
|