# ────────────────────────────────────────────────────────────────────────────────
# 🚧 FUTURE FEATURE: DECLARATIVE JSON RENDERER
# ────────────────────────────────────────────────────────────────────────────────
# This module implements a standalone "Video Engine" that renders videos based on a
# JSON specification (similar to Remotion or After Effects Scripting).
#
# NOTE: This is currently EXPERIMENTAL and separate from the main auto-clipping pipeline.
#       It is intended for future use cases where precise, programmatic control over
#       every frame, text, and transition is required (e.g., frontend-driven editing).
# ────────────────────────────────────────────────────────────────────────────────

import os
import tempfile
from typing import List, Optional, Union, Literal
from urllib.parse import urlparse

import requests
from moviepy.editor import (
    VideoFileClip, TextClip, ImageClip, CompositeVideoClip,
    ColorClip, AudioFileClip, CompositeAudioClip
)
from pydantic import BaseModel

# Seconds to wait for a remote asset server (connect / between received chunks).
DOWNLOAD_TIMEOUT = 30


# ─────────────────────────────────────────────────────────────
# 1. Define the Schema (The Language of the Engine)
# ─────────────────────────────────────────────────────────────

class Word(BaseModel):
    """A single timed word of a caption, as produced by `convert_whisper_to_timeline`."""
    text: str
    start: float  # Word start time, copied from the STT result
    end: float  # Word end time, copied from the STT result
    style: Optional[dict] = None  # Style for the word while it is active (karaoke highlight)


class Asset(BaseModel):
    """The media (or text) a clip displays or plays."""
    type: Literal['video', 'image', 'text', 'audio']
    src: Optional[str] = None  # Local path or http(s) URL; unused for text assets
    text: Optional[str] = None  # Caption content; only read for text assets
    # Per-word timings for text assets. Stored on the model only; the renderer
    # below does not draw per-word highlights yet.
    words: Optional[List[Word]] = None
    style: Optional[dict] = {}  # Font, color, size, bg_color, stroke_color, stroke_width, shadow_color, shadow_offset


class Animation(BaseModel):
    """An animation applied to a visual clip, starting at the clip's own t=0."""
    type: Literal['fade_in', 'fade_out', 'pop_in', 'scale_in', 'slide_up', 'slide_left']
    duration: float = 0.5


class Clip(BaseModel):
    """Placement of one asset on the timeline."""
    asset: Asset
    start: float  # Time on the output timeline at which the clip begins
    length: Optional[float] = None  # Clip duration; video/audio fall back to the source length
    trim_start: float = 0.0  # Offset into the source media (video/audio only)
    scale: float = 1.0  # Resize factor (video/image only)
    position: Union[Literal['center', 'top', 'bottom', 'left', 'right'], List[int]] = 'center'
    opacity: float = 1.0
    volume: float = 1.0
    layer: int = 0  # Lower layers are composited first (i.e. drawn underneath)
    animations: List[Animation] = []  # List of animations to apply


class Track(BaseModel):
    """A group of clips; tracks are flattened and ordered by clip layer at render time."""
    clips: List[Clip]


class Timeline(BaseModel):
    """The full composition: a background color plus tracks of clips."""
    background: str = "#000000"
    tracks: List[Track]


class OutputSpec(BaseModel):
    """Encoding parameters of the rendered file."""
    format: str = "mp4"
    resolution: str = "1080:1920"  # width:height
    fps: int = 30


class RenderRequest(BaseModel):
    """Top-level JSON document accepted by `JSONRenderer.render`."""
    timeline: Timeline
    output: OutputSpec


# ─────────────────────────────────────────────────────────────
# 2. The Engine (JSON -> MoviePy)
# ─────────────────────────────────────────────────────────────

class JSONRenderer:
    """Turns a `RenderRequest` into a video file using MoviePy."""

    def __init__(self, output_dir="outputs"):
        """Create the output directory if needed and start with no temp files."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.temp_files = []

    def _download_asset(self, url):
        """Helper to download assets from URLs.

        Returns `url` unchanged when it is not an http(s) URL, the path of a
        temporary file holding the download on success, or None on failure.
        Downloaded files are recorded in `self.temp_files` for `cleanup()`.
        """
        if not url.startswith(('http:', 'https:')):
            return url

        try:
            # Take the extension from the URL *path* so a query string
            # ("clip.mp4?token=...") does not end up in the file suffix.
            ext = os.path.splitext(urlparse(url).path)[1] or ".tmp"

            with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
                response.raise_for_status()
                with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tf:
                    # Register before writing so a partially written file is
                    # still removed by cleanup() if the download breaks off.
                    self.temp_files.append(tf.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        tf.write(chunk)
                    return tf.name
        except Exception as e:
            print(f"Failed to download asset: {url} - {e}")
            return None

    def cleanup(self):
        """Remove temp files (best effort) and forget them."""
        for f in self.temp_files:
            try:
                os.remove(f)
            except OSError:
                # Already gone or still locked; nothing more we can do here.
                pass
        self.temp_files.clear()

    @staticmethod
    def _parse_color(value):
        """Convert '#RRGGBB' / '#RGB' (leading '#' optional) to an (r, g, b) tuple.

        Raises ValueError for anything that is not 3 or 6 hex digits.
        """
        digits = str(value).strip().lstrip('#')
        if len(digits) == 3:
            digits = ''.join(ch * 2 for ch in digits)
        if len(digits) != 6:
            raise ValueError(f"Unsupported color value: {value!r}")
        try:
            return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
        except ValueError as err:
            raise ValueError(f"Unsupported color value: {value!r}") from err

    def render(self, request: RenderRequest, output_filename: str):
        """
        Takes a JSON spec and renders a video file.

        Returns the path of the written file (inside `self.output_dir`).
        Raises ValueError when the background color cannot be parsed or when
        no clip with a known duration could be built. Downloaded temp files
        and opened media readers are released whether or not rendering succeeds.
        """
        opened_clips = []  # everything that may hold a file reader
        final_video = None
        try:
            width, height = map(int, request.output.resolution.split(":"))
            fps = request.output.fps

            # 1. Create Background
            final_video_clips = []
            max_duration = 0
            audio_clips = []

            # ColorClip needs an RGB tuple, not the hex string used in the spec.
            bg_clip = ColorClip(
                size=(width, height),
                color=self._parse_color(request.timeline.background),
            )

            # 2. Process Tracks & Clips
            # Flatten all clips and sort by layer (ascending)
            all_clips_spec = [
                clip_spec
                for track in request.timeline.tracks
                for clip_spec in track.clips
            ]
            all_clips_spec.sort(key=lambda x: x.layer)

            for clip_spec in all_clips_spec:
                clip = self._create_moviepy_clip(clip_spec, width, height)
                if clip is None:
                    continue
                opened_clips.append(clip)

                # Image/text clips only get a duration from 'length'; without
                # one the timeline end cannot be computed, so leave them out.
                if clip.duration is None:
                    print(
                        f"Skipping {clip_spec.asset.type} clip starting at "
                        f"{clip_spec.start}: no duration (set 'length')"
                    )
                    continue

                # Apply timing
                clip = clip.set_start(clip_spec.start)

                # Update max duration
                max_duration = max(max_duration, clip_spec.start + clip.duration)

                # Separate audio/video
                if clip_spec.asset.type == 'audio':
                    audio_clips.append(clip)
                else:
                    final_video_clips.append(clip)

            if max_duration <= 0:
                raise ValueError("Timeline contains no renderable clips with a duration")

            # 3. Final Composition
            bg_clip = bg_clip.set_duration(max_duration)
            final_video_clips.insert(0, bg_clip)

            final_video = CompositeVideoClip(final_video_clips, size=(width, height))

            # Handle Audio Mixing: audio embedded in video clips + audio-only clips
            composite_audio_list = []
            if final_video.audio:
                composite_audio_list.append(final_video.audio)
            composite_audio_list.extend(audio_clips)

            if composite_audio_list:
                final_video.audio = CompositeAudioClip(composite_audio_list)

            final_video = final_video.set_duration(max_duration)

            # 4. Write File
            output_path = os.path.join(self.output_dir, output_filename)
            final_video.write_videofile(
                output_path,
                fps=fps,
                codec="libx264",
                audio_codec="aac",
                threads=4,
                preset="medium"
            )

            return output_path

        finally:
            # Release file readers before deleting the temp files they read.
            for clip in [final_video] + opened_clips:
                close = getattr(clip, "close", None)
                if close is None:
                    continue
                try:
                    close()
                except Exception as e:
                    print(f"Failed to close clip: {e}")
            self.cleanup()

    def _create_moviepy_clip(self, clip_spec: Clip, screen_w, screen_h):
        """Build the MoviePy clip described by `clip_spec`.

        Returns the clip, or None when the asset is unusable (missing source
        or text, failed download, or any error while building it — the error
        is printed). Audio clips are returned right after trimming and volume;
        visual clips additionally get position, opacity and animations.
        """
        asset = clip_spec.asset
        clip = None

        try:
            src_path = asset.src
            if src_path:
                # Returns local paths untouched, a temp file for http(s) URLs,
                # or None if the download failed.
                src_path = self._download_asset(src_path)

            # --- Video ---
            if asset.type == 'video':
                if not src_path:
                    return None
                clip = VideoFileClip(src_path)
                if clip_spec.length:
                    end = clip_spec.trim_start + clip_spec.length
                    clip = clip.subclip(clip_spec.trim_start, min(end, clip.duration))
                else:
                    clip = clip.subclip(clip_spec.trim_start)

                # Resize video
                if clip_spec.scale != 1.0:
                    clip = clip.resize(clip_spec.scale)

                # Audio Volume
                if clip.audio:
                    clip = clip.volumex(clip_spec.volume)

            # --- Image ---
            elif asset.type == 'image':
                if not src_path:
                    return None
                clip = ImageClip(src_path)
                if clip_spec.length:
                    clip = clip.set_duration(clip_spec.length)
                if clip_spec.scale != 1.0:
                    clip = clip.resize(clip_spec.scale)

            # --- Text ---
            elif asset.type == 'text':
                if not asset.text:
                    return None

                # 'style' may be None in the spec. A stroke width of 1 is this
                # branch's default when only a stroke color is given.
                style = {'stroke_width': 1, **(asset.style or {})}

                # Note: You need ImageMagick installed for TextClip
                clip = self._create_text_clip_from_style(asset.text, style, screen_w)
                if clip is None:
                    return None

                if clip_spec.length:
                    clip = clip.set_duration(clip_spec.length)

            # --- Audio ---
            elif asset.type == 'audio':
                if not src_path:
                    return None
                clip = AudioFileClip(src_path)
                if clip_spec.length:
                    end = clip_spec.trim_start + clip_spec.length
                    clip = clip.subclip(clip_spec.trim_start, min(end, clip.duration))
                elif clip_spec.trim_start:
                    # Honor trim_start even without an explicit length,
                    # matching the video branch.
                    clip = clip.subclip(clip_spec.trim_start)
                clip = clip.volumex(clip_spec.volume)
                return clip

            # --- Common Visual Props ---
            if clip:
                # 1. Apply Position first (animations below read the final position)
                pos = clip_spec.position
                if isinstance(pos, list):
                    pos = tuple(pos)
                clip = clip.set_position(pos)

                # 2. Apply Opacity
                if clip_spec.opacity < 1.0:
                    clip = clip.set_opacity(clip_spec.opacity)

                # 3. Apply Animations
                for anim in clip_spec.animations:
                    clip = self._apply_animation(clip, anim, screen_w, screen_h)

            return clip

        except Exception as e:
            print(f"Error creating clip for asset {asset}: {e}")
            # Do not leave a media reader open behind a clip we are discarding.
            close = getattr(clip, "close", None)
            if close is not None:
                try:
                    close()
                except Exception:
                    pass
            return None

    def _create_text_clip_from_style(self, text, style, screen_w):
        """Helper to create a TextClip with full styling support.

        Reads 'fontSize', 'color', 'font', 'backgroundColor', 'stroke_color',
        'stroke_width', 'shadow_color' and 'shadow_offset' from `style`. Text is
        wrapped to 90% of `screen_w`. Returns the clip (a composite when a
        shadow is requested) or None if MoviePy fails; the error is printed.
        """
        try:
            fontsize = style.get('fontSize', 70)
            color = style.get('color', 'white')
            font = style.get('font', 'Arial')
            bg_color = style.get('backgroundColor', None)
            stroke_color = style.get('stroke_color', None)
            stroke_width = style.get('stroke_width', 0)

            # TextClip has no drop shadow of its own; simulate one by placing
            # a copy of the text in the shadow color behind the main text.
            shadow_color = style.get('shadow_color', None)
            shadow_offset = style.get('shadow_offset', (2, 2))

            wrap_size = (int(screen_w * 0.9), None)  # Auto-wrap

            # Main Text
            txt_clip = TextClip(
                text,
                fontsize=fontsize,
                color=color,
                font=font,
                bg_color=bg_color,
                stroke_color=stroke_color,
                stroke_width=stroke_width,
                method='caption',
                align='center',
                size=wrap_size
            )

            if not shadow_color:
                return txt_clip

            off_x, off_y = shadow_offset[0], shadow_offset[1]
            pad_x, pad_y = abs(off_x), abs(off_y)

            # The canvas is padded by |offset| on every side, so the centered
            # text sits at (pad_x, pad_y); the shadow goes `offset` away from that.
            shadow_clip = TextClip(
                text,
                fontsize=fontsize,
                color=shadow_color,
                font=font,
                method='caption',
                align='center',
                size=wrap_size
            ).set_position((pad_x + off_x, pad_y + off_y))

            w, h = txt_clip.size
            return CompositeVideoClip(
                [shadow_clip, txt_clip.set_position('center')],
                size=(w + pad_x * 2, h + pad_y * 2)
            )

        except Exception as e:
            print(f"Error creating text clip: {e}")
            return None

    def _apply_animation(self, clip, anim: Animation, w, h):
        """Apply MoviePy transformations for animations.

        `w` and `h` are the output frame size. Times are relative to the clip's
        own start. Returns the transformed clip, or `clip` unchanged when the
        animation cannot be applied (non-positive duration for scale/slide, or
        a slide on a clip whose position is not absolute pixels).
        """
        d = anim.duration

        if anim.type == 'fade_in':
            return clip.fadein(d)

        elif anim.type == 'fade_out':
            return clip.fadeout(d)

        if d <= 0:
            # Nothing to interpolate; also avoids dividing by zero below.
            return clip

        if anim.type == 'pop_in':
            # Simple linear scale up to 1. A factor of exactly 0 would ask the
            # resizer for a 0x0 frame, so keep at least a couple of pixels.
            floor = min(1.0, 2.0 / max(1, min(clip.size)))
            return clip.resize(lambda t: max(floor, t / d) if t < d else 1)

        elif anim.type == 'scale_in':
            # Zoom from 0.8 to 1.0
            return clip.resize(lambda t: 0.8 + 0.2 * (t / d) if t < d else 1)

        elif anim.type in ('slide_up', 'slide_left'):
            # The position set earlier is treated as the destination.
            target = clip.pos(0)
            is_pixels = (
                isinstance(target, (tuple, list))
                and len(target) == 2
                and all(isinstance(v, (int, float)) for v in target)
            )
            if not is_pixels:
                # Keyword positions ('center', ...) would have to be resolved
                # to pixels first; not supported yet, so leave the clip static.
                return clip

            end_x, end_y = target
            if anim.type == 'slide_up':
                start_x, start_y = end_x, h  # enter from the bottom edge
            else:
                start_x, start_y = w, end_y  # enter from the right edge

            def slide(t):
                if t >= d:
                    return (end_x, end_y)
                progress = t / d
                return (
                    start_x + (end_x - start_x) * progress,
                    start_y + (end_y - start_y) * progress,
                )

            return clip.set_position(slide)

        return clip


# ─────────────────────────────────────────────────────────────
# 3. Helpers (STT -> Timeline)
# ─────────────────────────────────────────────────────────────

def convert_whisper_to_timeline(
    whisper_result: dict,
    video_path: str,
    max_words_per_line: int = 5,
    base_style: Optional[dict] = None,
    highlight_style: Optional[dict] = None
) -> Timeline:
    """
    Convert Whisper STT output to a renderer Timeline.

    Args:
        whisper_result: The raw output from Whisper (segments with words).
        video_path: Path to the source video.
        max_words_per_line: Max words to show at once (auto-segmentation).
        base_style: Default text style.
        highlight_style: Style for the active word (karaoke effect).

    Returns:
        A Timeline with the source video on layer 0 and one text clip per
        group of words on layer 1.

    Raises:
        ValueError: if `max_words_per_line` is smaller than 1.
    """
    if max_words_per_line < 1:
        raise ValueError("max_words_per_line must be at least 1")

    base_style = {} if base_style is None else base_style
    highlight_style = {} if highlight_style is None else highlight_style

    tracks = []

    # 1. Video Track (Background)
    video_track = Track(clips=[
        Clip(
            asset=Asset(type='video', src=video_path),
            start=0,
            layer=0
        )
    ])
    tracks.append(video_track)

    # 2. Text Track (Captions)
    # Flatten segments into a single list of words
    all_words = []
    for seg in whisper_result.get('segments', []):
        all_words.extend(seg.get('words', []))

    # Group words into chunks (lines)
    text_clips = []
    for i in range(0, len(all_words), max_words_per_line):
        chunk = all_words[i:i + max_words_per_line]

        start_time = chunk[0]['start']
        end_time = chunk[-1]['end']
        text_content = " ".join(w['word'].strip() for w in chunk)

        # Build Word objects with highlight timing; each gets its own copy of
        # the style so later edits to one clip cannot leak into the others.
        words_objs = [
            Word(
                text=w['word'].strip(),
                start=w['start'],
                end=w['end'],
                style=dict(highlight_style)  # Active style
            )
            for w in chunk
        ]

        text_clips.append(Clip(
            asset=Asset(
                type='text',
                text=text_content,
                words=words_objs,
                style=dict(base_style)
            ),
            start=start_time,
            length=end_time - start_time,
            position='center',  # Default position
            layer=1
        ))

    tracks.append(Track(clips=text_clips))

    return Timeline(background="#000000", tracks=tracks)


def _apply_animation(self, clip, anim: Animation, w, h):
    """Module-level alias of `JSONRenderer._apply_animation`.

    The file carried a second, module-level copy of the method body; the name
    is kept so any existing reference keeps working, but the logic now lives
    in one place.
    """
    return JSONRenderer._apply_animation(self, clip, anim, w, h)