Spaces:

ignitariumcloud
/

Gas_Pipe_Quality_Monitor

Runtime error

App Files Files Community

Gas_Pipe_Quality_Monitor / app_region+handtrack.py

arjunanand13

Rename app.py to app_region+handtrack.py

bb1a482 verified 5 months ago

raw

history blame

5.84 kB

	import cv2
	import torch
	from PIL import Image
	import numpy as np
	import os
	import shutil
	import gradio as gr
	import mediapipe as mp
	from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration, BitsAndBytesConfig

	# Device string derived from CUDA availability. The model itself is placed
	# via device_map below, so this name is not referenced in the visible code.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

	# 4-bit NF4 weights with double quantization; compute dtype is float16.
	quantization_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_compute_dtype=torch.float16,
	)

	# Load the quantized video-language model once at import time.
	model = LlavaNextVideoForConditionalGeneration.from_pretrained(
	    model_id,
	    quantization_config=quantization_config,
	    low_cpu_mem_usage=True,
	    device_map="auto",
	)

	# Processor paired with the same checkpoint (chat template + video inputs).
	processor = LlavaNextVideoProcessor.from_pretrained(model_id)

	# MediaPipe hand detector shared by every call to track_hand_position.
	# static_image_mode=True: each frame is processed as an independent image.
	mpHands = mp.solutions.hands
	hands = mpHands.Hands(static_image_mode=True, max_num_hands=2)
	mpDraw = mp.solutions.drawing_utils

	def track_hand_position(frame):
	    """Detect hands in a frame and label each by the half of the image it is in.

	    The frame is converted from BGR to RGB before being handed to the shared
	    MediaPipe ``hands`` detector. For every detected hand, the landmark x
	    coordinates are scaled to pixels (truncated to int) and averaged; a mean
	    left of the vertical midline yields "Region A", otherwise "Region B".
	    Landmarks and their connections are drawn onto ``frame`` in place.

	    Args:
	        frame: Image array whose first two shape entries are height and width.

	    Returns:
	        A ``(frame, hand_positions)`` tuple: the same frame object that was
	        passed in, and a list with one region label per detected hand.
	    """
	    _, frame_width = frame.shape[:2]
	    split_x = frame_width // 2

	    detection = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

	    hand_positions = []
	    # multi_hand_landmarks is falsy when nothing was detected.
	    for hand in detection.multi_hand_landmarks or ():
	        pixel_xs = [int(point.x * frame_width) for point in hand.landmark]
	        mean_x = sum(pixel_xs) / len(pixel_xs)

	        hand_positions.append("Region A" if mean_x < split_x else "Region B")

	        mpDraw.draw_landmarks(frame, hand, mpHands.HAND_CONNECTIONS)

	    return frame, hand_positions

	def add_regions_to_frame(frame, frame_idx, output_dir):
	    """Tint and label the two halves of a frame, track hands, and save it.

	    The left half is tinted with (255, 0, 0) and the right half with
	    (0, 255, 0) at 30% opacity, a white divider is drawn down the middle, and
	    the halves are labelled "Region A" / "Region B". Hand tracking then runs
	    on a copy of that annotated image, and the result is written to
	    ``{output_dir}/frame_{frame_idx:03d}.jpg``.

	    Args:
	        frame: Image array; the input itself is not modified.
	        frame_idx: Index used in the saved file name (zero-padded to 3 digits).
	        output_dir: Directory the JPEG is written into.

	    Returns:
	        ``(tracked_frame, hand_pos)`` as returned by ``track_hand_position``.
	    """
	    rows, cols = frame.shape[:2]
	    split_x = cols // 2
	    white = (255, 255, 255)

	    # Paint solid rectangles on a copy, then blend 70/30 with the source.
	    tint = frame.copy()
	    cv2.rectangle(tint, (0, 0), (split_x, rows), (255, 0, 0), -1)
	    cv2.rectangle(tint, (split_x, 0), (cols, rows), (0, 255, 0), -1)
	    annotated = cv2.addWeighted(frame, 0.7, tint, 0.3, 0)

	    cv2.line(annotated, (split_x, 0), (split_x, rows), white, 3)

	    # Each label starts a quarter of the way into its own half.
	    label_y = rows // 2
	    labels = (
	        ("Region A", split_x // 4),
	        ("Region B", split_x + split_x // 4),
	    )
	    for text, label_x in labels:
	        cv2.putText(annotated, text, (label_x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 2, white, 3)

	    # NOTE(review): detection runs on the tinted/labelled image, not the raw frame.
	    tracked_frame, hand_pos = track_hand_position(annotated.copy())

	    cv2.imwrite(f"{output_dir}/frame_{frame_idx:03d}.jpg", tracked_frame)

	    return tracked_frame, hand_pos

	def sample_frames(video_path, num_frames):
	    """Sample up to ``num_frames`` evenly spaced, annotated frames from a video.

	    Every ``interval``-th frame (``interval = max(1, total // num_frames)``) is
	    passed through ``add_regions_to_frame``, which also writes it as a JPEG
	    into ``/tmp/processed_frames``. That directory is wiped at the start of
	    each call, so results of a previous call are discarded.

	    Args:
	        video_path: Path of the video file to read.
	        num_frames: Maximum number of frames to sample. Values <= 0 yield
	            empty results.

	    Returns:
	        ``(frames, frame_paths, hand_tracking_log)``: the sampled frames as
	        RGB PIL images, the paths of the saved JPEGs in the same order, and one
	        "Frame N: [...]" log line per sampled frame.

	    Raises:
	        ValueError: If the video cannot be opened.
	    """
	    output_dir = "/tmp/processed_frames"

	    # Start from an empty directory so stale frames never show up in results.
	    if os.path.exists(output_dir):
	        shutil.rmtree(output_dir)
	    os.makedirs(output_dir)

	    video = cv2.VideoCapture(video_path)
	    frames = []
	    hand_tracking_log = []

	    # try/finally guarantees the capture handle is released even when frame
	    # processing (hand tracking, image writing) raises part-way through.
	    try:
	        if not video.isOpened():
	            raise ValueError(f"Could not open video file: {video_path}")

	        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
	        # Guard the division so num_frames == 0 yields no frames instead of
	        # a ZeroDivisionError.
	        interval = max(1, total_frames // num_frames) if num_frames > 0 else 1

	        for i in range(total_frames):
	            # Stop decoding as soon as enough frames have been collected.
	            if len(frames) >= num_frames:
	                break

	            ret, frame = video.read()
	            if not ret:
	                # Skip frames that fail to decode instead of aborting the clip.
	                continue
	            if i % interval != 0:
	                continue

	            # Output index of this sample; also used in the saved file name.
	            sample_idx = len(frames)
	            processed_frame, hand_positions = add_regions_to_frame(frame, sample_idx, output_dir)
	            frames.append(Image.fromarray(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB)))
	            hand_tracking_log.append(f"Frame {sample_idx}: {hand_positions}")
	    finally:
	        video.release()

	    frame_paths = [f"{output_dir}/frame_{i:03d}.jpg" for i in range(len(frames))]

	    return frames, frame_paths, hand_tracking_log

	def analyze_video(video_path):
	    """Classify a gas-pipe QC video with the video-language model.

	    Eight annotated frames are sampled via ``sample_frames``, sent to the model
	    together with the classification prompt, and the generated answer is
	    decoded. Sampling is enabled during generation, so repeated calls on the
	    same video may produce different text.

	    Args:
	        video_path: Path of the uploaded video file.

	    Returns:
	        ``(frame_paths, result, hand_tracking_summary)``: paths of the saved
	        annotated frames, the model's answer (without the prompt), and the
	        per-frame hand tracking log joined by newlines.

	    Raises:
	        gr.Error: If no video was supplied or no frame could be sampled.
	        ValueError: If the video file cannot be opened (from ``sample_frames``).
	    """
	    if not video_path:
	        raise gr.Error("Please provide a video to analyze.")

	    conversation = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": """Analyze this gas pipe quality control video and classify into one category:
	1) PASSED - pipe taken from Region A, dipped in water, no bubbles, moved to Region B.
	Example: Person picks pipe from left side, tests in water, no bubbles seen, places in right side.
	2) FAILED - pipe tested in water, bubbles visible. Example: Person dips pipe in water, bubbles appear indicating leak, pipe rejected.
	3) CHEATING - pipe moved from A to B without testing. Example: Person takes pipe from left and directly places in right without water test.
	Give classification and brief reason."""},
	                {"type": "video"},
	            ],
	        },
	    ]

	    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

	    video_frames, frame_paths, hand_log = sample_frames(video_path, 8)
	    if not video_frames:
	        # Fail with a readable message instead of an opaque processor error.
	        raise gr.Error("No frames could be read from the video.")

	    # Request PyTorch tensors explicitly so the .to(...) call below is valid
	    # regardless of the processor's default return type.
	    inputs = processor(text=prompt, videos=video_frames, padding=True, return_tensors="pt")
	    inputs = {k: v.to(model.device) for k, v in inputs.items()}

	    # generate() returns prompt tokens followed by the new tokens; remember
	    # where the prompt ends so only the answer is decoded.
	    prompt_length = inputs["input_ids"].shape[1]

	    output = model.generate(
	        **inputs,
	        max_new_tokens=150,
	        do_sample=True,
	        temperature=0.7,
	        top_p=0.9,
	        top_k=50,
	        repetition_penalty=1.1,
	        pad_token_id=processor.tokenizer.eos_token_id,
	    )

	    result = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

	    hand_tracking_summary = "\n".join(hand_log)

	    return frame_paths, result, hand_tracking_summary

	# Example inputs offered in the UI: one single-element row per clip path.
	examples = [[clip] for clip in ("07.mp4", "07_part1.mp4", "07_part2.mp4")]

	# One video in; annotated frames, the model's verdict and the hand log out.
	# The output order matches the tuple returned by analyze_video.
	iface = gr.Interface(
	    title="Gas Pipe Quality Control Analyzer",
	    fn=analyze_video,
	    inputs=gr.Video(),
	    outputs=[
	        gr.Gallery(label="Processed Frames"),
	        gr.Textbox(label="LLM Analysis", lines=10),
	        gr.Textbox(label="Hand Tracking Log", lines=15),
	    ],
	    examples=examples,
	    cache_examples=False,
	)

	# Start the app at import time and request a public share link.
	iface.launch(share=True)