| import os |
| import cv2 |
| import numpy as np |
| import gradio as gr |
| from segment_anything import sam_model_registry, SamPredictor |
| from youtube_transcript_api import YouTubeTranscriptApi |
|
|
def video_to_frames(video_path, output_dir, frame_rate=0.7):
    """Extract frames from a video at roughly `frame_rate` frames per second.

    Frames are written to `output_dir` as `frame_<index>.jpg`, where <index>
    is the frame's position in the original video (used later to recover
    timestamps via index / fps).

    Args:
        video_path: Path to the input video file.
        output_dir: Directory to write JPEG frames into (created if missing).
        frame_rate: Target sampling rate in frames per second.

    Returns:
        The source video's native FPS.

    Raises:
        ValueError: If the video's FPS cannot be read (unreadable file).
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            # cap.get returns 0 for files OpenCV cannot open; without this
            # guard the modulo below raises ZeroDivisionError.
            raise ValueError(f"Could not read FPS from video: {video_path}")
        # Keep every Nth frame; clamp to 1 so very low-FPS videos still work.
        frame_interval = max(1, int(fps / frame_rate))
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                cv2.imwrite(os.path.join(output_dir, f'frame_{frame_count:05d}.jpg'), frame)
            frame_count += 1
    finally:
        # Release the capture even if frame extraction fails midway.
        cap.release()
    return fps
|
|
def select_background_points(image, num_points=4):
    """Return background prompt points for SAM in (x, y) pixel order.

    SAM's `SamPredictor.predict(point_coords=...)` expects coordinates as
    (x, y), i.e. (column, row). The previous version emitted (row, col)
    pairs, which placed points outside the image whenever width != height.

    Args:
        image: HxWxC image array; only its shape is used.
        num_points: If greater than 4, the four edge midpoints are added
            to the four corners (yielding 8 points total).

    Returns:
        An (N, 2) int array of (x, y) coordinates inside the image bounds.
    """
    h, w, _ = image.shape
    # Four image corners, (x, y) order.
    points = np.array([
        [0, 0],
        [w - 1, 0],
        [0, h - 1],
        [w - 1, h - 1],
    ])

    if num_points > 4:
        # Edge midpoints: top, left, bottom, right.
        points = np.vstack([points,
                            [w // 2, 0],
                            [0, h // 2],
                            [w // 2, h - 1],
                            [w - 1, h // 2]])

    return points
|
|
def compare_histograms(frame1, frame2, threshold=0.4):
    """Return True when the two frames' color distributions differ enough.

    Builds an 8x8x8 BGR histogram for each frame, normalizes it, and
    compares the pair by correlation; a correlation below `threshold`
    counts as a significant difference (likely scene change).
    """
    normalized = []
    for frame in (frame1, frame2):
        hist = cv2.calcHist([frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
        normalized.append(cv2.normalize(hist, hist).flatten())
    correlation = cv2.compareHist(normalized[0], normalized[1], cv2.HISTCMP_CORREL)
    return correlation < threshold
|
|
def detect_scene_changes(frame_dir, fps, threshold=0.15, hist_threshold=0.3):
    """Detect scene-change timestamps (seconds) in a directory of frames.

    A change is flagged when either the SAM background mask flips on more
    than `threshold` of pixels between consecutive frames, or the color
    histograms differ (see `compare_histograms`).

    NOTE: relies on the module-level `predictor` (a SamPredictor) set up by
    `process_video_and_transcript` before this is called.

    Args:
        frame_dir: Directory containing `frame_<index>.jpg` files.
        fps: The source video's FPS, used to convert frame index to seconds.
        threshold: Fraction of mask pixels that must flip to flag a change.
        hist_threshold: Correlation threshold passed to compare_histograms.

    Returns:
        List of timestamps (seconds) where scene changes were detected.
    """
    # Only consider extracted frame images: the caller writes the uploaded
    # video into this same directory, and cv2.imread would return None for
    # it (crashing cvtColor below) if we listed every file.
    frames = sorted(f for f in os.listdir(frame_dir)
                    if f.startswith('frame_') and f.endswith('.jpg'))
    scene_changes = []
    prev_mask = None
    prev_frame = None

    for frame_name in frames:
        frame = cv2.imread(os.path.join(frame_dir, frame_name))
        if frame is None:
            # Unreadable/corrupt frame file — skip rather than crash.
            continue
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        predictor.set_image(frame_rgb)
        background_points = select_background_points(frame_rgb)
        # Label 0 marks every prompt point as background.
        point_labels = np.zeros(background_points.shape[0], dtype=int)
        masks, _, _ = predictor.predict(point_coords=background_points,
                                        point_labels=point_labels,
                                        multimask_output=False)
        mask_diff = 0
        if prev_mask is not None:
            # Fraction of pixels whose background mask flipped between frames.
            mask_diff = np.logical_xor(masks[0], prev_mask).mean()
        hist_diff = False
        if prev_frame is not None:
            hist_diff = compare_histograms(prev_frame, frame, threshold=hist_threshold)

        if mask_diff > threshold or hist_diff:
            # Files are named frame_<index>.jpg; index / fps gives seconds.
            timestamp = int(frame_name.split('_')[1].split('.')[0]) / fps
            scene_changes.append(timestamp)

        prev_mask = masks[0]
        prev_frame = frame

    return scene_changes
|
|
def get_transcript(video_id):
    """Fetch the transcript for a YouTube video, best-effort.

    Args:
        video_id: The YouTube video ID (the part after `v=` in the URL).

    Returns:
        The transcript as returned by YouTubeTranscriptApi (a list of
        dicts with 'text', 'start', 'duration'), or an empty list when
        the transcript is unavailable for any reason.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # Deliberate best-effort: a missing/disabled transcript simply
        # yields no text rather than failing the whole request.
        return []
|
|
def group_transcripts_by_scenes(transcripts, scene_changes):
    """Group transcript entries into one text chunk per scene.

    Args:
        transcripts: List of dicts with 'start' (seconds) and 'text' keys,
            in ascending start order.
        scene_changes: Ascending list of scene-change timestamps (seconds).

    Returns:
        List of strings, one per scene. A scene with no transcript entries
        yields an empty string, so later text stays aligned with the
        correct scene number.
    """
    grouped_transcripts = []
    scene_index = 0
    current_group = []

    for transcript in transcripts:
        start_time = transcript['start']
        # Advance past EVERY boundary this entry crosses (`while`, not `if`):
        # with a single `if`, an entry that skips multiple scene boundaries
        # would advance only one scene, dropping empty scenes and attributing
        # subsequent text to the wrong scene.
        while scene_index < len(scene_changes) and start_time > scene_changes[scene_index]:
            grouped_transcripts.append(' '.join(t['text'] for t in current_group))
            current_group = []
            scene_index += 1
        current_group.append(transcript)

    if current_group:
        grouped_transcripts.append(' '.join(t['text'] for t in current_group))

    return grouped_transcripts
|
|
def process_video_and_transcript(video_file, youtube_video_id):
    """Gradio handler: detect scene changes and group the transcript by scene.

    Args:
        video_file: The uploaded video — modern `gr.Video` passes a
            filesystem path (str); older Gradio versions may pass a
            file-like object, which is handled too.
        youtube_video_id: YouTube video ID whose transcript to fetch.

    Returns:
        One text blob with "Scene N: ..." paragraphs, one per scene.
    """
    output_dir = "Output_frames"
    # Create the working directory BEFORE writing the upload into it;
    # previously it was only created later, inside video_to_frames.
    os.makedirs(output_dir, exist_ok=True)

    if isinstance(video_file, str):
        # gr.Video hands us a temp-file path; use it directly.
        video_path = video_file
    else:
        video_path = os.path.join(output_dir, "uploaded_video.mp4")
        with open(video_path, "wb") as f:
            f.write(video_file.read())

    fps = video_to_frames(video_path, output_dir, frame_rate=0.7)

    # NOTE(review): the SAM checkpoint is reloaded on every request, which is
    # slow — consider loading it once at module import time.
    model = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
    global predictor
    predictor = SamPredictor(model)

    scene_changes = detect_scene_changes(output_dir, fps, threshold=0.15, hist_threshold=0.3)

    transcripts = get_transcript(youtube_video_id)

    grouped_transcripts = group_transcripts_by_scenes(transcripts, scene_changes)

    return "\n\n".join(f"Scene {i + 1}: {text}" for i, text in enumerate(grouped_transcripts))
|
|
| |
# Wire up the Gradio UI: a video upload plus a YouTube video ID textbox,
# producing the grouped transcript as plain text.
video_input = gr.Video(label="Upload Video File (.mp4)")
video_id_input = gr.Textbox(label="YouTube Video ID")

interface = gr.Interface(
    fn=process_video_and_transcript,
    inputs=[video_input, video_id_input],
    outputs="text",
    title="Scene Change Detection & Transcript Grouping",
    description=(
        "Upload a video file and input a YouTube video ID. The app will "
        "detect scene changes in the video and group the transcript text "
        "according to these scene changes."
    ),
)

interface.launch()
|
|