| """ | |
| Emotion-based keyframe selection - analyzes emotions FIRST, then selects matching frames | |
| """ | |
| import os | |
| import cv2 | |
| import srt | |
| from typing import List, Dict, Tuple | |
| import numpy as np | |
| from backend.enhanced_emotion_matcher import EnhancedEmotionMatcher | |
| from backend.eye_state_detector import EyeStateDetector | |
| from backend.emotion_aware_comic import FacialExpressionAnalyzer | |


def generate_keyframes_emotion_based(video_path: str, story_subs: List, max_frames: int = 48):
    """
    Generate keyframes by matching facial expressions to dialogue emotions.

    Analyzes the emotion of each dialogue line FIRST, then scans the video
    for the frames whose facial expressions best match.
    """
    print("🎭 Emotion-Based Frame Selection (analyzing emotions BEFORE frame selection)")
    print(f"📝 Analyzing {len(story_subs)} dialogues for emotions...")

    # Initialize analyzers
    emotion_matcher = EnhancedEmotionMatcher()
    face_analyzer = FacialExpressionAnalyzer()
    eye_detector = EyeStateDetector()
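
    # Interface sketch (inferred from how these objects are used in this file;
    # the actual backend classes may differ):
    #   emotion_matcher.analyze_text_emotion(text) -> Dict[str, float] of
    #       emotion scores (e.g. 'happy', 'sad', ...) plus an 'intensity' key
    #   face_analyzer.analyze_expression(img_path) -> Dict[str, float], same shape
    #   eye_detector.check_eyes_state(img_path)    -> dict whose 'state' is one of
    #       'open', 'half_closed', 'closed'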

    # Step 1: Analyze all dialogue emotions first
    dialogue_emotions = []
    for i, sub in enumerate(story_subs[:max_frames]):
        text_emotions = emotion_matcher.analyze_text_emotion(sub.content)
        dominant_emotion = max(text_emotions.items(),
                               key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]
        dialogue_emotions.append({
            'subtitle': sub,
            'text': sub.content,
            'emotions': text_emotions,
            'dominant': dominant_emotion,
            'start_time': sub.start.total_seconds(),
            'end_time': sub.end.total_seconds()
        })
        print(f"  📝 Dialogue {i+1}: '{sub.content[:40]}...' → {dominant_emotion}")

    print("\n🎬 Scanning video for matching facial expressions...")

    # Ensure output directory exists
    final_dir = "frames/final"
    os.makedirs(final_dir, exist_ok=True)

    # Clear existing frames
    for f in os.listdir(final_dir):
        if f.endswith('.png'):
            os.remove(os.path.join(final_dir, f))

    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Failed to open video: {video_path}")
        return False

    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        # Some containers report no frame rate; fall back to a common default
        fps = 25.0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"📹 Video: {fps} fps, {total_frames} total frames")

    # Step 2: For each dialogue, find the best matching frame
    selected_frames = []
    for idx, dialogue_data in enumerate(dialogue_emotions):
        print(f"\n🔍 Finding best frame for dialogue {idx+1}: {dialogue_data['dominant']} emotion")

        best_frame = find_best_emotional_frame(
            cap, dialogue_data, fps,
            face_analyzer, eye_detector,
            scan_window=2.0  # Scan 2 seconds around dialogue
        )

        if best_frame is not None:
            # Save the selected frame
            output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
            cv2.imwrite(output_path, best_frame['image'])
            selected_frames.append({
                'path': output_path,
                'dialogue': dialogue_data,
                'face_emotion': best_frame['face_emotion'],
                'match_score': best_frame['match_score'],
                'eye_state': best_frame['eye_state']
            })
            print(f"  ✅ Selected frame with {best_frame['face_emotion']} face "
                  f"(match: {best_frame['match_score']:.0%}, eyes: {best_frame['eye_state']})")
        else:
            print("  ⚠️ No good emotional match found, using default frame")
            # Fallback: just get the middle frame of the dialogue
            fallback_frame = get_fallback_frame(cap, dialogue_data, fps)
            if fallback_frame is not None:
                output_path = os.path.join(final_dir, f"frame{idx:03d}.png")
                cv2.imwrite(output_path, fallback_frame)
                selected_frames.append({
                    'path': output_path,
                    'dialogue': dialogue_data,
                    'face_emotion': 'unknown',
                    'match_score': 0.0,
                    'eye_state': 'unknown'
                })

    cap.release()

    # Summary
    print("\n📊 Emotion-Based Selection Summary:")
    print(f"✅ Selected {len(selected_frames)} frames based on emotion matching")

    if selected_frames:
        good_matches = sum(1 for f in selected_frames if f['match_score'] > 0.7)
        print(f"📈 Good emotion matches: {good_matches}/{len(selected_frames)}")

        # Count emotions
        emotion_counts = {}
        for frame in selected_frames:
            emotion = frame['face_emotion']
            emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1

        print("\n🎭 Selected facial expressions:")
        for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {emotion}: {count} frames")

    return len(selected_frames) > 0


def find_best_emotional_frame(cap, dialogue_data, fps, face_analyzer, eye_detector, scan_window=2.0):
    """
    Find the best frame that matches the dialogue emotion.

    Scans frames around the dialogue timing to find a matching facial expression.
    """
    target_emotion = dialogue_data['dominant']
    text_emotions = dialogue_data['emotions']

    # Calculate scan range
    center_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    start_time = max(0, center_time - scan_window)
    end_time = center_time + scan_window

    start_frame = int(start_time * fps)
    end_frame = int(end_time * fps)

    # Sample frames (don't check every single frame)
    num_samples = min(20, end_frame - start_frame)  # Check up to 20 frames
    if num_samples <= 0:
        num_samples = 5
    frame_step = max(1, (end_frame - start_frame) // num_samples)
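
    # Example (hypothetical numbers): at fps=24 and scan_window=2.0 the window
    # spans 4 s = 96 frames, so num_samples = min(20, 96) = 20 and
    # frame_step = max(1, 96 // 20) = 4; range() below then visits 96 / 4 = 24
    # frames, so integer division makes "up to 20" a loose target, not a hard cap.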

    best_match = None
    best_score = -1

    for frame_num in range(start_frame, end_frame, frame_step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret or frame is None:
            continue

        # Save temp frame for analysis
        temp_path = f"temp_emotion_check_{frame_num}.png"
        cv2.imwrite(temp_path, frame)

        try:
            # Check eye state first
            eye_state = eye_detector.check_eyes_state(temp_path)

            # Skip if eyes are closed or half-closed
            if eye_state['state'] in ['closed', 'half_closed']:
                continue

            # Analyze facial expression
            face_emotions = face_analyzer.analyze_expression(temp_path)
            face_dominant = max(face_emotions.items(),
                                key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]

            # Calculate match score
            score = calculate_emotion_match_score(text_emotions, face_emotions, target_emotion)

            # Bonus for good eye state
            if eye_state['state'] == 'open':
                score *= 1.2

            # Update best match
            if score > best_score:
                best_score = score
                best_match = {
                    'image': frame.copy(),
                    'face_emotion': face_dominant,
                    'face_emotions': face_emotions,
                    'match_score': min(score, 1.0),
                    'eye_state': eye_state['state'],
                    'frame_num': frame_num
                }
        finally:
            # Clean up temp file
            if os.path.exists(temp_path):
                os.remove(temp_path)

    return best_match


def calculate_emotion_match_score(text_emotions: Dict, face_emotions: Dict, target_emotion: str) -> float:
    """Calculate how well the face matches the text emotion."""
    score = 0.0

    # Direct match bonus
    if target_emotion in face_emotions and face_emotions[target_emotion] > 0.3:
        score += face_emotions[target_emotion] * 2.0

    # Check if the face has the target emotion as its dominant one
    face_dominant = max(face_emotions.items(),
                        key=lambda x: x[1] if x[0] != 'intensity' else 0)[0]
    if face_dominant == target_emotion:
        score += 0.5

    # Compare all emotions
    for emotion in ['happy', 'sad', 'angry', 'surprised', 'scared', 'neutral']:
        text_val = text_emotions.get(emotion, 0)
        face_val = face_emotions.get(emotion, 0)

        if text_val > 0.3 and face_val > 0.3:
            # Both have this emotion
            score += min(text_val, face_val) * 0.5
        elif text_val > 0.5 and face_val < 0.2:
            # Text has the emotion but the face doesn't - penalty
            score -= 0.2

    # Intensity matching
    text_intensity = text_emotions.get('intensity', 0.5)
    face_intensity = face_emotions.get('intensity', 0.5)
    intensity_diff = abs(text_intensity - face_intensity)
    score += (1 - intensity_diff) * 0.3

    return max(0, score)
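

# Worked example for calculate_emotion_match_score (hypothetical values):
# with target_emotion='happy', text_emotions={'happy': 0.8, 'intensity': 0.7}
# and face_emotions={'happy': 0.6, 'neutral': 0.2, 'intensity': 0.6}:
#   direct match bonus : 0.6 * 2.0             -> score = 1.20
#   dominant match     : + 0.5                 -> score = 1.70
#   shared 'happy'     : + min(0.8, 0.6) * 0.5 -> score = 2.00
#   intensity          : + (1 - 0.1) * 0.3     -> score = 2.27
# The caller later clamps the displayed value with min(score, 1.0).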


def get_fallback_frame(cap, dialogue_data, fps):
    """Get a fallback frame from the middle of the dialogue."""
    middle_time = (dialogue_data['start_time'] + dialogue_data['end_time']) / 2
    frame_num = int(middle_time * fps)

    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
    ret, frame = cap.read()
    return frame if ret else None
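

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline proper. The file paths
    # below are hypothetical placeholders; substitute your own video/SRT pair.
    # Assumes the `srt` package (https://pypi.org/project/srt/) is installed.
    import sys
    import srt

    video = sys.argv[1] if len(sys.argv) > 1 else "input.mp4"      # hypothetical path
    subs_path = sys.argv[2] if len(sys.argv) > 2 else "input.srt"  # hypothetical path

    with open(subs_path, encoding="utf-8") as f:
        subs = list(srt.parse(f.read()))

    ok = generate_keyframes_emotion_based(video, subs, max_frames=48)
    print("Done." if ok else "No frames were selected.")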