import os

# Allow PyTorch to fall back to CPU for ops not implemented on Apple's MPS backend.
# Set before importing torch so the flag is in place when the backend initializes.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import re

import cv2
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline as hf_pipeline
def extract_keyframes(video_path, num_frames=5):
    """
    Extracts evenly spaced frames from the video as RGB PIL images.
    """
    vidcap = cv2.VideoCapture(video_path)
    if not vidcap.isOpened():
        raise IOError(f"Could not open video: {video_path}")
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Pick num_frames indices spread evenly across the whole video.
    frame_indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
    frames = []
    for idx in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        success, image = vidcap.read()
        if success:
            # OpenCV returns BGR; convert to RGB before handing off to PIL.
            img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(img))
    vidcap.release()
    return frames
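
# Quick standalone check of the frame extractor (hypothetical file name "clip.mp4"):
#   frames = extract_keyframes("clip.mp4", num_frames=3)
#   frames[0].save("first_keyframe.png")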
def caption_image(image, processor, model, device):
    """
    Generates a caption for a single image using BLIP.
    """
    inputs = processor(image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only; skip gradient tracking
        out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)
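
# Hypothetical one-off check (loads BLIP just for this snippet, assumes "frame.png" exists):
#   proc = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
#   mdl = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
#   print(caption_image(Image.open("frame.png").convert("RGB"), proc, mdl, "cpu"))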
# Initialize the T5 summarization pipeline once at import time (device=-1 runs on CPU).
summary_pipeline = hf_pipeline("summarization", model="t5-base", device=-1)
def summarize_captions_t5(captions):
    """
    Use T5 with light preprocessing and prompting to merge the per-frame
    captions into a single coherent summary.
    """
    # Remove duplicate captions (case-insensitively) while preserving order.
    unique_captions = []
    seen = set()
    for caption in captions:
        if caption.lower() not in seen:
            unique_captions.append(caption)
            seen.add(caption.lower())

    # If only one unique caption survives, there is nothing to consolidate.
    if len(unique_captions) == 1:
        return unique_captions[0]

    # Build a structured T5 input that encourages consolidation.
    text = f"summarize: The video shows: {'. '.join(unique_captions)}."
    # Run T5 with generation settings tuned for short, non-repetitive summaries.
    try:
        summary = summary_pipeline(
            text,
            max_length=20,            # cap the summary at ~20 tokens
            min_length=10,            # ensure the output is not trivially short
            do_sample=True,           # sample for more natural phrasing
            temperature=0.7,          # moderate randomness
            num_beams=4,              # beam search for higher-quality candidates
            early_stopping=True,
            no_repeat_ngram_size=3    # block repeated 3-grams
        )
        result = summary[0]['summary_text']
        # Strip any echoed prompt prefix.
        result = re.sub(r'^(summarize:|summary:)', '', result, flags=re.IGNORECASE).strip()

        # Fix spacing around punctuation.
        result = re.sub(r'\s+([.,;:!?])', r'\1', result)      # no space before punctuation
        result = re.sub(r'([.,;:!?])(\w)', r'\1 \2', result)  # add space after punctuation

        # Add an indefinite article before person nouns, but only when no article
        # already precedes the word (avoids producing "the a man" or "a a woman").
        result = re.sub(
            r'\b(?<!\ba )(?<!\ban )(?<!\bthe )(?<!\bA )(?<!\bAn )(?<!\bThe )'
            r'(man|woman|person|student|teacher|individual)\b',
            r'a \1',
            result,
        )

        # Fix common sentence starters.
        result = re.sub(r'^Video shows\b', 'The video shows', result)
        result = re.sub(r'^Man\b', 'A man', result)
        result = re.sub(r'^Woman\b', 'A woman', result)
        result = re.sub(r'^Person\b', 'A person', result)

        # Collapse any double articles that slipped through.
        result = re.sub(r'\ba a\b', 'a', result)
        result = re.sub(r'\bthe the\b', 'the', result)
        result = re.sub(r'\bA a\b', 'A', result)
        result = re.sub(r'\bThe the\b', 'The', result)
        result = re.sub(r'\ban a\b', 'a', result)
        result = re.sub(r'\bAn a\b', 'A', result)

        # Capitalize the first letter after each sentence boundary.
        result = re.sub(r'(\.)(\s*)([a-z])', lambda m: m.group(1) + ' ' + m.group(3).upper(), result)

        # Capitalize the first character without lowercasing the rest
        # (str.capitalize() would undo the sentence-boundary fixes above).
        if result:
            result = result[0].upper() + result[1:]

        # Ensure terminal punctuation.
        if not result.endswith('.'):
            result += '.'

        return result
    except Exception as e:
        print(f"T5 summarization failed: {e}")
        # Fallback: return the longest (and likely most descriptive) unique caption.
        return max(unique_captions, key=len)
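
# Hypothetical sanity check of the summarizer on hand-written captions:
#   print(summarize_captions_t5([
#       "a man writing on a whiteboard",
#       "a man standing in front of a whiteboard",
#   ]))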
def describe_video(video_path, num_frames=5):
    # Runs on CPU by default; the optional pick_device() sketch below shows
    # one way to use an accelerator when available.
    device = "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
    frames = extract_keyframes(video_path, num_frames)
    captions = [caption_image(frame, processor, model, device) for frame in frames]
    # print(f"Individual frame captions: {captions}")  # debug: per-frame captions
    summary = summarize_captions_t5(captions)
    return summary
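
# Optional sketch (an assumption, not part of the original flow): pick the best
# available torch device instead of hardcoding "cpu". The MPS fallback env var
# set at the top of this file is what makes "mps" usable on Apple silicon even
# when some ops are missing from the backend.
def pick_device():
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"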
if __name__ == "__main__":
    print(describe_video("WritingOnBoard.mp4", num_frames=5))