import os

# Allow PyTorch to fall back to CPU for ops not implemented on Apple's MPS backend.
# Set before importing torch so the flag is in place when the backend initializes.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import re

import cv2
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline as hf_pipeline
def extract_keyframes(video_path, num_frames=5):
    """
    Extracts evenly spaced frames from the video as RGB PIL images.
    """
    vidcap = cv2.VideoCapture(video_path)
    if not vidcap.isOpened():
        raise IOError(f"Could not open video: {video_path}")
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Pick num_frames indices spread evenly across the whole video.
    frame_indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
    frames = []
    for idx in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        success, image = vidcap.read()
        if success:
            # OpenCV returns BGR; convert to RGB before handing off to PIL.
            img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(img))
    vidcap.release()
    return frames
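
# Quick standalone check of the frame extractor (hypothetical file name "clip.mp4"):
#   frames = extract_keyframes("clip.mp4", num_frames=3)
#   frames[0].save("first_keyframe.png")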
def caption_image(image, processor, model, device):
    """
    Generates a caption for a single image using BLIP.
    """
    inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only; skip gradient tracking
        out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)
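
# Hypothetical one-off check (loads BLIP just for this snippet, assumes "frame.png" exists):
#   proc = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
#   mdl = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
#   print(caption_image(Image.open("frame.png").convert("RGB"), proc, mdl, "cpu"))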
# Initialize the T5 summarization pipeline once at import time (device=-1 runs on CPU).
summary_pipeline = hf_pipeline("summarization", model="t5-base", device=-1)
def summarize_captions_t5(captions):
    """
    Use T5 with light preprocessing and prompting to merge the per-frame
    captions into a single coherent summary.
    """
    # Remove duplicate captions (case-insensitively) while preserving order.
    unique_captions = []
    seen = set()
    for caption in captions:
        if caption.lower() not in seen:
            unique_captions.append(caption)
            seen.add(caption.lower())

    # If only one unique caption survives, there is nothing to consolidate.
    if len(unique_captions) == 1:
        return unique_captions[0]

    # Build a structured T5 input that encourages consolidation.
    text = f"summarize: The video shows: {'. '.join(unique_captions)}."
    # Run T5 with generation settings tuned for short, non-repetitive summaries.
    try:
        summary = summary_pipeline(
            text,
            max_length=20,            # cap the summary at ~20 tokens
            min_length=10,            # ensure the output is not trivially short
            do_sample=True,           # sample for more natural phrasing
            temperature=0.7,          # moderate randomness
            num_beams=4,              # beam search for higher-quality candidates
            early_stopping=True,
            no_repeat_ngram_size=3    # block repeated 3-grams
        )
        result = summary[0]['summary_text']
        # Strip any echoed prompt prefix.
        result = re.sub(r'^(summarize:|summary:)', '', result, flags=re.IGNORECASE).strip()

        # Fix spacing around punctuation.
        result = re.sub(r'\s+([.,;:!?])', r'\1', result)      # no space before punctuation
        result = re.sub(r'([.,;:!?])(\w)', r'\1 \2', result)  # add space after punctuation

        # Add an indefinite article before person nouns, but only when no article
        # already precedes the word (avoids producing "the a man" or "a a woman").
        result = re.sub(
            r'\b(?<!\ba )(?<!\ban )(?<!\bthe )(?<!\bA )(?<!\bAn )(?<!\bThe )'
            r'(man|woman|person|student|teacher|individual)\b',
            r'a \1',
            result,
        )

        # Fix common sentence starters.
        result = re.sub(r'^Video shows\b', 'The video shows', result)
        result = re.sub(r'^Man\b', 'A man', result)
        result = re.sub(r'^Woman\b', 'A woman', result)
        result = re.sub(r'^Person\b', 'A person', result)

        # Collapse any double articles that slipped through.
        result = re.sub(r'\ba a\b', 'a', result)
        result = re.sub(r'\bthe the\b', 'the', result)
        result = re.sub(r'\bA a\b', 'A', result)
        result = re.sub(r'\bThe the\b', 'The', result)
        result = re.sub(r'\ban a\b', 'a', result)
        result = re.sub(r'\bAn a\b', 'A', result)

        # Capitalize the first letter after each sentence boundary.
        result = re.sub(r'(\.)(\s*)([a-z])', lambda m: m.group(1) + ' ' + m.group(3).upper(), result)

        # Capitalize the first character without lowercasing the rest
        # (str.capitalize() would undo the sentence-boundary fixes above).
        if result:
            result = result[0].upper() + result[1:]

        # Ensure terminal punctuation.
        if not result.endswith('.'):
            result += '.'

        return result
    except Exception as e:
        print(f"T5 summarization failed: {e}")
        # Fallback: return the longest (and likely most descriptive) unique caption.
        return max(unique_captions, key=len)
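
# Hypothetical sanity check of the summarizer on hand-written captions:
#   print(summarize_captions_t5([
#       "a man writing on a whiteboard",
#       "a man standing in front of a whiteboard",
#   ]))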
def describe_video(video_path, num_frames=5):
    # Runs on CPU by default; the optional pick_device() sketch below shows
    # one way to use an accelerator when available.
    device = "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
    frames = extract_keyframes(video_path, num_frames)
    captions = [caption_image(frame, processor, model, device) for frame in frames]
    # print(f"Individual frame captions: {captions}")  # debug: per-frame captions
    summary = summarize_captions_t5(captions)
    return summary
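
# Optional sketch (an assumption, not part of the original flow): pick the best
# available torch device instead of hardcoding "cpu". The MPS fallback env var
# set at the top of this file is what makes "mps" usable on Apple silicon even
# when some ops are missing from the backend.
def pick_device():
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"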
if __name__ == "__main__":
    print(describe_video("WritingOnBoard.mp4", num_frames=5))