import cv2
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline as hf_pipeline
import re
import os

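# Allow PyTorch to fall back to the CPU for operators that are not implemented
# on Apple's MPS backend (harmless on platforms without MPS)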
| os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" | |
def extract_keyframes(video_path, num_frames=5):
    """
    Extracts evenly spaced frames from the video as PIL images.
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    # Guard against missing or unreadable files, which report zero frames
    if total_frames <= 0:
        vidcap.release()
        return frames
    frame_indices = [int(i * total_frames / num_frames) for i in range(num_frames)]
    for idx in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        success, image = vidcap.read()
        if success:
            # OpenCV decodes frames as BGR; convert to RGB before handing to PIL
            img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(img))
    vidcap.release()
    return frames
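
# Usage sketch for extract_keyframes ("clip.mp4" is a hypothetical local file):
#   frames = extract_keyframes("clip.mp4", num_frames=3)
#   frames[0].save("keyframe_0.png")  # frames are PIL images
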
def caption_image(image, processor, model, device):
    """
    Generates a caption for a single image using BLIP.
    """
    inputs = processor(image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)
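
# Note: caption_image relies on the library's default decoding settings; generation
# parameters such as num_beams or max_new_tokens could be passed to model.generate()
# if longer or more detailed captions are wanted.
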
# Initialize the T5 summarization pipeline once at module load (device=-1 keeps it on CPU)
summary_pipeline = hf_pipeline("summarization", model="t5-base", device=-1)

def summarize_captions_t5(captions):
    """
    Use T5 with preprocessing and prompting to create a single coherent summary.
    """
    # Remove duplicate captions (case-insensitively) while preserving order
    unique_captions = []
    seen = set()
    for caption in captions:
        if caption.lower() not in seen:
            unique_captions.append(caption)
            seen.add(caption.lower())
    # Nothing to summarize if no frames produced captions
    if not unique_captions:
        return ""
    # If only one unique caption remains, return it directly
    if len(unique_captions) == 1:
        return unique_captions[0]
    # Clean and preprocess captions (currently disabled)
    # cleaned_captions = []
    # for caption in unique_captions:
    #     # Remove common prefixes that BLIP adds
    #     cleaned = re.sub(r'^(a |an |the )', '', caption.lower())
    #     cleaned_captions.append(cleaned)
    # Create a structured input for T5 that encourages consolidation
    text = f"summarize: The video shows: {'. '.join(unique_captions)}."
    # Use T5 with generation parameters tuned for a short, consolidated summary
    try:
        summary = summary_pipeline(
            text,
            max_length=20,           # Cap the summary at 20 tokens
            min_length=10,           # Ensure the output is not trivially short
            do_sample=True,          # Enable sampling for more natural output
            temperature=0.7,         # Add some variety
            num_beams=4,             # Use beam search for better quality
            early_stopping=True,
            no_repeat_ngram_size=3   # Prevent repetition of 3-grams
        )
        result = summary[0]['summary_text']
        # Clean up the output: strip any echoed task prefix
        result = re.sub(r'^(summarize:|summary:)', '', result, flags=re.IGNORECASE).strip()
        # Fix spacing around punctuation
        result = re.sub(r'\s+([.,;:!?])', r'\1', result)  # Remove spaces before punctuation
        result = re.sub(r'([.,;:!?])(\w)', r'\1 \2', result)  # Add a space after punctuation if missing
        # Add an indefinite article before person nouns
        result = re.sub(r'\b(man|woman|person|student|teacher|individual)\b', r'a \1', result)
        # Fix common sentence starters
        result = re.sub(r'^Video shows\b', 'The video shows', result)
        result = re.sub(r'^Man\b', 'A man', result)
        result = re.sub(r'^Woman\b', 'A woman', result)
        result = re.sub(r'^Person\b', 'A person', result)
        # Collapse doubled articles that the substitutions above may have created
        result = re.sub(r'\ba a\b', 'a', result)
        result = re.sub(r'\bthe the\b', 'the', result)
        result = re.sub(r'\bA a\b', 'A', result)
        result = re.sub(r'\bThe the\b', 'The', result)
        result = re.sub(r'\ban a\b', 'a', result)
        result = re.sub(r'\bAn a\b', 'A', result)
        result = re.sub(r'\bthe a\b', 'the', result)  # e.g. "the man" -> "the a man" -> "the man"
        result = re.sub(r'\bThe a\b', 'The', result)
        # Capitalize the first letter after each sentence-ending period
        result = re.sub(r'(\.)(\s*)([a-z])', lambda m: m.group(1) + ' ' + m.group(3).upper(), result)
        # Capitalize only the first character; str.capitalize() would lowercase
        # the rest of the string and undo the sentence-boundary fix above
        if result:
            result = result[0].upper() + result[1:]
        # Ensure the summary ends with terminal punctuation
        if not result.endswith(('.', '!', '?')):
            result += '.'
        return result
    except Exception as e:
        print(f"T5 summarization failed: {e}")
        # Fallback: return the longest (most descriptive) unique caption
        return max(unique_captions, key=len)

def describe_video(video_path, num_frames=5):
    device = "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
    frames = extract_keyframes(video_path, num_frames)
    captions = [caption_image(frame, processor, model, device) for frame in frames]
    # print(f"Individual frame captions: {captions}")  # Debug output
    summary = summarize_captions_t5(captions)
    return summary

if __name__ == "__main__":
    print(describe_video("WritingOnBoard.mp4", num_frames=5))
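
# Any other local video can be described the same way, e.g. (hypothetical file):
#   print(describe_video("lecture_clip.mp4", num_frames=8))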