# Spaces: Runtime error  (status badge captured with the page; not part of the program)
| import cv2 | |
| import torch | |
| from PIL import Image | |
| import numpy as np | |
| import os | |
| import shutil | |
| import gradio as gr | |
| import mediapipe as mp | |
| from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration, BitsAndBytesConfig | |
# "cuda" when torch can see a GPU, otherwise "cpu".  Not referenced again in
# the visible code: model placement is delegated to device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face Hub checkpoint used for both the model and its processor.
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# 4-bit NF4 weights with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Loaded once at import time and shared by every request.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
)
processor = LlavaNextVideoProcessor.from_pretrained(model_id)
# MediaPipe hand-landmark solution and its drawing helpers, created once and
# reused for every processed frame.
mpHands = mp.solutions.hands
mpDraw = mp.solutions.drawing_utils
# Frames are handed over as independent still images (static_image_mode=True),
# with at most two hands reported per image.
hands = mpHands.Hands(max_num_hands=2, static_image_mode=True)
def track_hand_position(frame):
    """Detect hands in a frame and label which half of the image each one is in.

    A hand is labelled "Region A" when the mean x pixel coordinate of its
    landmarks lies left of the vertical centre line and "Region B"
    otherwise.  The landmarks and their connections are drawn onto
    ``frame`` itself.

    Args:
        frame: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before detection).

    Returns:
        A ``(frame, hand_positions)`` tuple: the same array that was passed
        in, and a list holding one region label per detected hand (empty
        when no hand is found).
    """
    _, frame_width = frame.shape[:2]
    centre_x = frame_width // 2

    detection = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    labels = []
    # multi_hand_landmarks is falsy when nothing was detected.
    for hand in detection.multi_hand_landmarks or []:
        # Landmark x values are scaled by the frame width to get pixels.
        pixel_xs = [int(point.x * frame_width) for point in hand.landmark]
        mean_x = sum(pixel_xs) / len(pixel_xs)
        labels.append("Region A" if mean_x < centre_x else "Region B")
        mpDraw.draw_landmarks(frame, hand, mpHands.HAND_CONNECTIONS)
    return frame, labels
def add_regions_to_frame(frame, frame_idx, output_dir):
    """Annotate a frame with hand landmarks and the two regions, then save it.

    Hand detection runs on an untouched copy of ``frame`` *before* the
    region tint and label text are applied, so the detector never sees the
    overlay or has a hand hidden behind the label text.  The left half is
    then tinted with BGR (255, 0, 0) and labelled "Region A", the right half
    with BGR (0, 255, 0) and labelled "Region B", separated by a white line.
    The input array is not modified.

    Args:
        frame: Source image array in BGR channel order.
        frame_idx: Index used in the output file name
            (``frame_<idx, zero-padded to 3 digits>.jpg``).
        output_dir: Existing directory the annotated JPEG is written to.

    Returns:
        A ``(annotated_frame, hand_positions)`` tuple: the annotated image
        and the list of region labels reported by ``track_hand_position``.

    Raises:
        OSError: If the annotated frame could not be written to disk.
    """
    height, width = frame.shape[:2]
    mid_width = width // 2

    # Detect (and draw) hands first, on a clean copy of the source frame.
    tracked_frame, hand_pos = track_hand_position(frame.copy())

    # Paint both halves solid on a copy, then blend at 30% for a light tint.
    overlay = tracked_frame.copy()
    cv2.rectangle(overlay, (0, 0), (mid_width, height), (255, 0, 0), -1)
    cv2.rectangle(overlay, (mid_width, 0), (width, height), (0, 255, 0), -1)
    annotated = cv2.addWeighted(tracked_frame, 0.7, overlay, 0.3, 0)

    cv2.line(annotated, (mid_width, 0), (mid_width, height), (255, 255, 255), 3)
    cv2.putText(
        annotated, "Region A", (mid_width // 4, height // 2),
        cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3,
    )
    cv2.putText(
        annotated, "Region B", (mid_width + mid_width // 4, height // 2),
        cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 3,
    )

    output_path = f"{output_dir}/frame_{frame_idx:03d}.jpg"
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(output_path, annotated):
        raise OSError(f"Could not write annotated frame to {output_path}")
    return annotated, hand_pos
def sample_frames(video_path, num_frames):
    """Sample up to ``num_frames`` evenly spaced frames from a video and annotate them.

    Target frame indices are spread across the whole clip
    (``k * total_frames // num_frames`` for ``k`` in ``range(num_frames)``),
    so the samples cover the full video even when the frame count is not a
    multiple of ``num_frames``.  Each sampled frame is passed through
    ``add_regions_to_frame``, which also writes it to disk as a JPEG.

    Args:
        video_path: Path of the video file to read.
        num_frames: Maximum number of frames to sample; must be positive.

    Returns:
        A ``(frames, frame_paths, hand_tracking_log)`` tuple:
        the annotated frames as RGB ``PIL.Image`` objects, the paths of the
        saved JPEGs (same order), and one ``"Frame <n>: <positions>"`` log
        line per sampled frame.

    Raises:
        ValueError: If ``num_frames`` is not positive, the video cannot be
            opened, or no frame could be read from it.
    """
    if num_frames <= 0:
        raise ValueError(f"num_frames must be positive, got {num_frames}")

    # NOTE: this directory is fixed and wiped on every call, so overlapping
    # calls would clobber each other's frames.
    output_dir = "/tmp/processed_frames"
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        raise ValueError(f"Could not open video file: {video_path}")

    frames = []
    hand_tracking_log = []
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # De-duplicated so clips shorter than num_frames yield each frame once.
        if total_frames > 0:
            targets = sorted(
                {(k * total_frames) // num_frames for k in range(num_frames)}
            )
        else:
            targets = []

        cursor = 0  # position of the next still-unsatisfied target
        for i in range(total_frames):
            if cursor == len(targets):
                break  # everything sampled; no need to decode the rest
            ret, frame = video.read()
            # On a failed read the target stays pending, so the next frame
            # that decodes successfully is used in its place.
            if not ret or i < targets[cursor]:
                continue

            frame_idx = len(frames)
            processed_frame, hand_positions = add_regions_to_frame(
                frame, frame_idx, output_dir
            )
            frames.append(
                Image.fromarray(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
            )
            hand_tracking_log.append(f"Frame {frame_idx}: {hand_positions}")

            # Skip every target this frame already covers (more than one only
            # after read failures pushed us past a target).
            while cursor < len(targets) and targets[cursor] <= i:
                cursor += 1
    finally:
        # Release the capture even if frame processing raised.
        video.release()

    if not frames:
        raise ValueError(f"No frames could be read from video file: {video_path}")

    frame_paths = [f"{output_dir}/frame_{i:03d}.jpg" for i in range(len(frames))]
    return frames, frame_paths, hand_tracking_log
def analyze_video(video_path):
    """Classify a gas-pipe quality-control video with the video-language model.

    Eight annotated frames are sampled from the video and sent to the model
    together with a prompt asking for one of PASSED / FAILED / CHEATING and
    a brief reason.  Generation uses sampling, so the answer can differ
    between calls on the same video.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        A ``(frame_paths, result, hand_tracking_summary)`` tuple: the paths
        of the saved annotated frames, the model's answer (newly generated
        text only, without the prompt), and the per-frame hand-tracking log
        joined with newlines.

    Raises:
        ValueError: If no video path is given, or if ``sample_frames``
            rejects the video.
    """
    if not video_path:
        raise ValueError("No video was provided for analysis.")

    instructions = (
        "Analyze this gas pipe quality control video and classify into one category:\n"
        "1) PASSED - pipe taken from Region A, dipped in water, no bubbles, moved to Region B.\n"
        "Example: Person picks pipe from left side, tests in water, no bubbles seen, places in right side.\n"
        "2) FAILED - pipe tested in water, bubbles visible. Example: Person dips pipe in water, bubbles appear indicating leak, pipe rejected.\n"
        "3) CHEATING - pipe moved from A to B without testing. Example: Person takes pipe from left and directly places in right without water test.\n"
        "Give classification and brief reason."
    )
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instructions},
                {"type": "video"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    video_frames, frame_paths, hand_log = sample_frames(video_path, 8)

    # return_tensors="pt" is required: without it the processor does not
    # return torch tensors and the .to(model.device) call below fails.
    inputs = processor(
        text=prompt, videos=video_frames, padding=True, return_tensors="pt"
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens; drop them so the
    # result contains only the model's answer rather than the echoed prompt.
    prompt_length = inputs["input_ids"].shape[1]
    result = processor.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    hand_tracking_summary = "\n".join(hand_log)
    return frame_paths, result, hand_tracking_summary
# Example clips offered in the UI, one single-element row per clip
# (presumably files shipped alongside the app; verify they exist).
examples = [[clip] for clip in ("07.mp4", "07_part1.mp4", "07_part2.mp4")]

# One output component per value returned by analyze_video, in order.
output_components = [
    gr.Gallery(label="Processed Frames"),
    gr.Textbox(label="LLM Analysis", lines=10),
    gr.Textbox(label="Hand Tracking Log", lines=15),
]

iface = gr.Interface(
    title="Gas Pipe Quality Control Analyzer",
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=output_components,
    examples=examples,
    # Examples run on demand instead of being precomputed.
    cache_examples=False,
)

iface.launch(share=True)