|
|
""" |
|
|
Scene Planner - Uses Transformer Model for Intelligent Scene Generation |
|
|
Replaces toy logic with proper ML-based scene planning |
|
|
""" |
|
|
|
|
|
import math |
|
|
import logging |
|
|
from typing import List, Dict, Tuple |
|
|
from models.text.bangla_parser import extract_scenes, BanglaSceneParser |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class ScenePlanner:
    """
    Plan timed video scenes from Bangla text.

    Scene descriptions are obtained from a ``BanglaSceneParser``; this class
    decides how many scenes to request and gives every scene a duration, a
    start/end time, a visual style and a transition type.
    """

    # Bounds (in seconds) applied to a single scene before the plan is
    # rescaled to the requested total duration.
    _MIN_SCENE_SECONDS = 1.5
    _MAX_SCENE_SECONDS = 8.0

    # Each of these words found in a description lengthens the scene by 30%.
    _COMPLEXITY_INDICATORS = ('চলাচল', 'কথোপকথন', 'অনেক', 'জটিল')

    # Ordered (style, keywords) pairs; the first style with a matching keyword wins.
    _STYLE_KEYWORDS = (
        ("nature_landscape", ('প্রকৃতি', 'বন', 'নদী')),
        ("urban_environment", ('শহর', 'রাস্তা', 'গাড়ি')),
        ("indoor_scene", ('বাড়ি', 'ঘর', 'আসবাব')),
        ("character_focused", ('মানুষ', 'ব্যক্তি', 'দল')),
    )

    def __init__(self, model_id: str = "google/mt5-small"):
        """
        Initialize the scene planner.

        Args:
            model_id: Model identifier handed to ``BanglaSceneParser``.
        """
        self.parser = BanglaSceneParser(model_id)
        logger.info("ScenePlanner initialized with transformer model")

    def plan_scenes(self, text_bn: str, duration: int = 15) -> List[Dict]:
        """
        Generate a scene plan from Bangla text.

        Never raises for parser or timing failures: any exception raised while
        planning is logged and a generic three-scene fallback plan is returned.

        Args:
            text_bn: Input Bangla text. ``None``, empty or whitespace-only
                text yields the fallback plan.
            duration: Total video duration in seconds.

        Returns:
            List of scene dictionaries with the keys ``id``, ``description``,
            ``duration``, ``start_time``, ``end_time``, ``visual_style`` and
            ``transition_type``.
        """
        # ``not text_bn`` also covers None, which would otherwise crash on
        # ``.strip()`` before the best-effort ``try`` below is even entered.
        if not text_bn or not text_bn.strip():
            logger.warning("Empty text provided to scene planner")
            return self._fallback_scenes(duration)

        try:
            scene_count = self._calculate_scene_count(text_bn, duration)
            logger.info(f"Planning {scene_count} scenes for {duration}s video")

            raw_scenes = self.parser.extract_scenes(text_bn, scene_count)
            scenes = self._generate_scene_timing(raw_scenes, duration, scene_count)

            logger.info(f"Generated {len(scenes)} scenes successfully")
            return scenes
        except Exception as e:
            # Deliberate best-effort boundary: callers always get a usable plan.
            logger.error(f"Scene planning failed: {e}")
            return self._fallback_scenes(duration)

    def _calculate_scene_count(self, text_bn: str, duration: int) -> int:
        """
        Calculate the number of scenes to request for the text and duration.

        The duration picks a base count (3 up to 10s, 5 up to 20s, 7 up to 30s,
        otherwise one scene per 3 seconds limited to 5-12). When the text has
        sentence terminators, the count is additionally capped at
        ``sentences + 2`` (at most 10).

        Args:
            text_bn: Input Bangla text.
            duration: Video duration in seconds.

        Returns:
            Scene count, always within 3-12.
        """
        if duration <= 10:
            base_scenes = 3
        elif duration <= 20:
            base_scenes = 5
        elif duration <= 30:
            base_scenes = 7
        else:
            base_scenes = min(12, max(5, duration // 3))

        # Sentence terminators: Bangla danda, full stop and exclamation mark.
        sentences = sum(text_bn.count(mark) for mark in ('।', '.', '!'))
        if sentences > 0:
            content_based = min(10, sentences + 2)
            scene_count = min(base_scenes, content_based)
        else:
            scene_count = base_scenes

        return max(3, min(scene_count, 12))

    def _generate_scene_timing(self, scenes: List[str], duration: int, scene_count: int) -> List[Dict]:
        """
        Turn scene descriptions into a timed scene plan.

        Args:
            scenes: List of scene descriptions.
            duration: Total video duration in seconds.
            scene_count: Number of scenes that was requested. Kept for
                interface compatibility; the timing is derived from
                ``len(scenes)``.

        Returns:
            List of scene dictionaries with timing, or the fallback plan when
            ``scenes`` is empty.
        """
        if not scenes:
            return self._fallback_scenes(duration)

        total_scenes = len(scenes)
        base_duration = duration / total_scenes

        scenes_with_timing = []
        # Chaining each start time to the previous end time keeps adjacent
        # scenes exactly contiguous and avoids re-summing every earlier
        # duration for each new scene.
        start_time = 0
        for i, scene_desc in enumerate(scenes):
            scene_duration = self._calculate_scene_duration(
                scene_desc, base_duration, i, total_scenes
            )
            end_time = start_time + scene_duration

            scenes_with_timing.append({
                "id": i + 1,
                "description": scene_desc,
                "duration": scene_duration,
                "start_time": start_time,
                "end_time": end_time,
                "visual_style": self._determine_visual_style(scene_desc),
                "transition_type": self._determine_transition(i, total_scenes),
            })
            start_time = end_time

        # Per-scene clamping can make the sum drift from the requested total.
        self._adjust_timing_for_total_duration(scenes_with_timing, duration)

        return scenes_with_timing

    def _calculate_scene_duration(self, scene_desc: str, base_duration: float,
                                  scene_index: int, total_scenes: int) -> float:
        """
        Calculate the preliminary duration of a single scene.

        Args:
            scene_desc: Scene description.
            base_duration: Even share of the total duration per scene.
            scene_index: Zero-based index of the scene.
            total_scenes: Total number of scenes (currently unused).

        Returns:
            Duration in seconds, clamped to 1.5-8.0. This is a preliminary
            value: ``_generate_scene_timing`` rescales the whole plan
            afterwards, so final durations may fall outside this range.
        """
        # Pacing variation: a repeating 0.9x / 1.0x / 1.1x cycle over the scenes.
        duration = base_duration * (0.9 + 0.2 * (scene_index % 3) / 2)

        complexity = sum(
            1 for indicator in self._COMPLEXITY_INDICATORS if indicator in scene_desc
        )
        if complexity > 0:
            duration *= (1 + 0.3 * complexity)

        return max(self._MIN_SCENE_SECONDS, min(duration, self._MAX_SCENE_SECONDS))

    def _determine_visual_style(self, scene_desc: str) -> str:
        """
        Determine the visual style for a scene from keywords in its description.

        Args:
            scene_desc: Scene description.

        Returns:
            The first style whose keyword occurs in the description, or
            ``"general_visual"`` when none matches.
        """
        # Lower-case once instead of once per keyword.
        text = scene_desc.lower()
        for style, keywords in self._STYLE_KEYWORDS:
            if any(word in text for word in keywords):
                return style
        return "general_visual"

    def _determine_transition(self, scene_index: int, total_scenes: int) -> str:
        """
        Determine the transition type for a scene.

        Args:
            scene_index: Zero-based index of the scene.
            total_scenes: Total number of scenes.

        Returns:
            ``"fade_in"`` for the first scene, ``"fade_out"`` for the last
            scene, ``"cross_fade"`` otherwise. A single-scene plan gets
            ``"fade_in"`` because the first-scene check runs first.
        """
        if scene_index == 0:
            return "fade_in"
        if scene_index == total_scenes - 1:
            return "fade_out"
        return "cross_fade"

    def _adjust_timing_for_total_duration(self, scenes: List[Dict], target_duration: float) -> None:
        """
        Rescale scene timings in place so they add up to the target duration.

        Nothing is changed when the plan is empty, when its durations sum to
        zero (there is nothing to scale), or when the current total is already
        within 0.1s of the target.

        Args:
            scenes: List of scenes with timing, in playback order; modified
                in place.
            target_duration: Target total duration in seconds.
        """
        current_total = sum(scene['duration'] for scene in scenes)

        # An empty or zero-length plan would otherwise divide by zero below.
        if current_total == 0:
            return

        if abs(current_total - target_duration) < 0.1:
            return

        adjustment_factor = target_duration / current_total

        # Walk the list in order with a running clock instead of indexing via
        # scene['id'] and re-summing all preceding durations for every scene.
        start_time = 0
        for scene in scenes:
            scene['duration'] = scene['duration'] * adjustment_factor
            scene['start_time'] = start_time
            scene['end_time'] = start_time + scene['duration']
            start_time = scene['end_time']

    def _fallback_scenes(self, duration: int) -> List[Dict]:
        """
        Generate a generic plan of three equal scenes.

        Used when the input text is empty or the main planning path fails.

        Args:
            duration: Video duration in seconds.

        Returns:
            Three scene dictionaries with the same keys as a regular plan.
        """
        scene_count = 3
        scene_duration = duration / scene_count

        return [
            {
                "id": i + 1,
                "description": f"Fallback Scene {i+1}: Visual content for segment {i+1}",
                "duration": scene_duration,
                "start_time": i * scene_duration,
                "end_time": (i + 1) * scene_duration,
                "visual_style": "general_visual",
                # Same rule as regular plans, so the first scene fades in
                # rather than cross-fading from nothing.
                "transition_type": self._determine_transition(i, scene_count),
            }
            for i in range(scene_count)
        ]

    def get_scene_statistics(self, scenes: List[Dict]) -> Dict:
        """
        Get statistics about a generated scene plan.

        Args:
            scenes: List of scene dictionaries; each needs ``duration`` and
                ``visual_style``.

        Returns:
            Dictionary with ``total_scenes``, ``total_duration``,
            ``avg_scene_duration``, ``min_scene_duration``,
            ``max_scene_duration``, ``visual_styles`` (distinct styles in
            first-seen order) and ``scene_distribution`` (style -> count).
            For an empty plan only ``total_scenes`` and ``total_duration``
            are present, both 0.
        """
        if not scenes:
            return {"total_scenes": 0, "total_duration": 0}

        durations = [scene['duration'] for scene in scenes]
        total_duration = sum(durations)

        # Single pass; dict insertion order gives a deterministic style order.
        distribution = {}
        for scene in scenes:
            style = scene['visual_style']
            distribution[style] = distribution.get(style, 0) + 1

        return {
            "total_scenes": len(scenes),
            "total_duration": total_duration,
            "avg_scene_duration": total_duration / len(durations),
            "min_scene_duration": min(durations),
            "max_scene_duration": max(durations),
            "visual_styles": list(distribution),
            "scene_distribution": distribution,
        }
|
|
|
|
|
|
|
|
# Module-wide planner cache; read and replaced only by get_planner().
_planner_instance = None


def get_planner(model_id: str = "google/mt5-small") -> ScenePlanner:
    """
    Return the shared ScenePlanner, creating it on demand.

    A new planner is built when none has been cached yet, or when the cached
    planner's parser reports a ``model_id`` different from the requested one;
    the new planner then replaces the cached one.

    Args:
        model_id: Model identifier passed to ``ScenePlanner``.

    Returns:
        The cached (possibly freshly created) planner.
    """
    global _planner_instance

    planner = _planner_instance
    stale = planner is None or planner.parser.model_id != model_id
    if stale:
        planner = ScenePlanner(model_id)
        _planner_instance = planner
    return planner
|
|
|
|
|
def plan_scenes(text_bn: str, duration: int = 15) -> List[Dict]:
    """
    Plan scenes with the shared default planner.

    Shortcut for ``get_planner().plan_scenes(text_bn, duration)``.

    Args:
        text_bn: Input Bangla text.
        duration: Total video duration in seconds.

    Returns:
        The scene list produced by the planner.
    """
    return get_planner().plan_scenes(text_bn, duration)