# auto_cliper/core/renderer.py
# Origin: aliSaac510, commit d392f23
# "feat: Add text wrapping support for long captions in highlight mode"
# ────────────────────────────────────────────────────────────────────────────────
# 🚧 FUTURE FEATURE: DECLARATIVE JSON RENDERER
# ────────────────────────────────────────────────────────────────────────────────
# This module implements a standalone "Video Engine" that renders videos based on a
# JSON specification (similar to Remotion or After Effects Scripting).
#
# NOTE: This is currently EXPERIMENTAL and separate from the main auto-clipping pipeline.
# It is intended for future use cases where precise, programmatic control over
# every frame, text, and transition is required (e.g., frontend-driven editing).
# ────────────────────────────────────────────────────────────────────────────────
import os
import requests
import tempfile
from moviepy.editor import (
VideoFileClip, TextClip, ImageClip, CompositeVideoClip,
ColorClip, AudioFileClip, CompositeAudioClip
)
from pydantic import BaseModel
from typing import List, Optional, Union, Literal
# ─────────────────────────────────────────────────────────────
# 1. Define the Schema (The Language of the Engine)
# ─────────────────────────────────────────────────────────────
class Asset(BaseModel):
    """A single renderable media resource (video, image, text or audio)."""
    type: Literal['video', 'image', 'text', 'audio']
    src: Optional[str] = None  # local path or http(s) URL; not used for text assets
    text: Optional[str] = None  # caption content; only meaningful when type == 'text'
    # NOTE: pydantic copies field defaults per instance, so the shared-{} pitfall
    # of plain Python defaults does not apply here.
    style: Optional[dict] = {} # Font, color, size, bg_color, stroke_color, stroke_width, shadow_color, shadow_offset
class Animation(BaseModel):
    """An entrance/exit effect applied to a clip at render time."""
    type: Literal['fade_in', 'fade_out', 'pop_in', 'scale_in', 'slide_up', 'slide_left']
    duration: float = 0.5  # effect length in seconds
class Clip(BaseModel):
    """Placement of one Asset on the timeline, plus its visual/audio props."""
    asset: Asset
    start: float  # timeline start time in seconds
    length: Optional[float] = None  # duration in seconds; None = natural asset length
    trim_start: float = 0.0  # seconds trimmed from the head of the source asset
    scale: float = 1.0  # uniform resize factor (1.0 = original size)
    # Either a named anchor or an explicit [x, y] pixel pair (JSON array).
    position: Union[Literal['center', 'top', 'bottom', 'left', 'right'], List[int]] = 'center'
    opacity: float = 1.0  # 1.0 = fully opaque; < 1.0 triggers set_opacity
    volume: float = 1.0  # audio gain multiplier
    layer: int = 0  # higher layers are composited on top
    # NOTE: pydantic copies field defaults per instance, so `= []` is safe here.
    animations: List[Animation] = [] # List of animations to apply
class Track(BaseModel):
    """An ordered group of clips; all tracks are flattened at render time."""
    clips: List[Clip]
class Timeline(BaseModel):
    """Complete scene description: background colour plus all tracks."""
    background: str = "#000000"  # background colour as a '#RRGGBB' hex string
    tracks: List[Track]
class OutputSpec(BaseModel):
    """Encoding parameters for the rendered output file."""
    format: str = "mp4"
    resolution: str = "1080:1920" # width:height
    fps: int = 30  # frames per second of the encoded video
class RenderRequest(BaseModel):
    """Top-level render request: what to render (timeline) and how (output)."""
    timeline: Timeline
    output: OutputSpec
# ─────────────────────────────────────────────────────────────
# 2. The Engine (JSON -> MoviePy)
# ─────────────────────────────────────────────────────────────
class JSONRenderer:
    """Render a video file from a declarative :class:`RenderRequest`.

    The engine flattens every track into a single layer-ordered clip list,
    turns each :class:`Clip` spec into a MoviePy clip, composites everything
    over a solid background colour and encodes the result (libx264 / AAC).
    Remote (http/https) assets are downloaded to temp files and removed
    again after rendering.
    """

    def __init__(self, output_dir="outputs"):
        self.output_dir = output_dir
        # Create the output directory up front so render() can always write.
        os.makedirs(output_dir, exist_ok=True)
        self.temp_files = []  # downloaded asset paths, removed in cleanup()

    @staticmethod
    def _parse_color(color):
        """Convert a '#RRGGBB' hex string to an (r, g, b) tuple.

        MoviePy's ColorClip expects an RGB tuple, while the Timeline schema
        stores the background as a '#000000'-style hex string; passing the
        raw string through was a bug. Non-hex values are returned unchanged.
        """
        if isinstance(color, str) and len(color) == 7 and color.startswith('#'):
            return tuple(int(color[i:i + 2], 16) for i in (1, 3, 5))
        return color

    def _download_asset(self, url):
        """Download a remote asset to a temp file and return its local path.

        Local paths are returned unchanged. Returns None when the download
        fails; callers treat that as a missing asset.
        """
        if not url.startswith(('http:', 'https:')):
            return url
        try:
            # timeout keeps a dead server from hanging the whole render
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()
            # Keep the original extension so MoviePy can sniff the format.
            ext = os.path.splitext(url)[1] or ".tmp"
            tf = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
            for chunk in response.iter_content(chunk_size=8192):
                tf.write(chunk)
            tf.close()
            self.temp_files.append(tf.name)
            return tf.name
        except Exception as e:
            print(f"Failed to download asset: {url} - {e}")
            return None

    def cleanup(self):
        """Delete all temp files created while rendering (best effort)."""
        for f in self.temp_files:
            try:
                os.remove(f)
            except OSError:
                # Already gone or locked; this is deliberately best-effort.
                pass
        self.temp_files = []

    def render(self, request: RenderRequest, output_filename: str):
        """Render ``request`` to ``output_dir/output_filename``.

        Returns the output path. Propagates MoviePy/IO errors; downloaded
        temp files are always cleaned up via the ``finally`` block.
        """
        try:
            width, height = map(int, request.output.resolution.split(":"))
            fps = request.output.fps

            final_video_clips = []
            audio_clips = []
            max_duration = 0

            # Background colour layer (hex converted to the RGB tuple that
            # ColorClip actually accepts).
            bg_clip = ColorClip(
                size=(width, height),
                color=self._parse_color(request.timeline.background)
            )

            # Flatten all tracks, then sort ascending by layer so higher
            # layers are composited on top.
            all_clips_spec = [
                clip_spec
                for track in request.timeline.tracks
                for clip_spec in track.clips
            ]
            all_clips_spec.sort(key=lambda c: c.layer)

            for clip_spec in all_clips_spec:
                clip = self._create_moviepy_clip(clip_spec, width, height)
                if not clip:
                    continue
                clip = clip.set_start(clip_spec.start)
                # Total duration is the latest end time of any clip.
                max_duration = max(max_duration, clip_spec.start + clip.duration)
                if clip_spec.asset.type == 'audio':
                    audio_clips.append(clip)
                else:
                    final_video_clips.append(clip)

            # Final composition: background first, clips in layer order.
            bg_clip = bg_clip.set_duration(max_duration)
            final_video_clips.insert(0, bg_clip)
            final_video = CompositeVideoClip(final_video_clips, size=(width, height))

            # Mix audio embedded in the video clips with dedicated
            # audio-track clips.
            composite_audio_list = []
            if final_video.audio:
                composite_audio_list.append(final_video.audio)
            composite_audio_list.extend(audio_clips)
            if composite_audio_list:
                final_video.audio = CompositeAudioClip(composite_audio_list)

            final_video = final_video.set_duration(max_duration)

            output_path = os.path.join(self.output_dir, output_filename)
            final_video.write_videofile(
                output_path,
                fps=fps,
                codec="libx264",
                audio_codec="aac",
                threads=4,
                preset="medium"
            )
            return output_path
        finally:
            self.cleanup()

    def _create_moviepy_clip(self, clip_spec: Clip, screen_w, screen_h):
        """Build a MoviePy clip for one Clip spec.

        Returns None when the asset cannot be loaded; render() skips such
        clips instead of aborting the whole job.
        """
        asset = clip_spec.asset
        clip = None
        try:
            src_path = asset.src
            # Use scheme prefixes consistent with _download_asset so a path
            # that merely starts with "http" is not treated as a URL.
            if src_path and src_path.startswith(('http:', 'https:')):
                src_path = self._download_asset(src_path)

            # --- Video ---
            if asset.type == 'video':
                if not src_path:
                    return None
                clip = VideoFileClip(src_path)
                if clip_spec.length:
                    end = clip_spec.trim_start + clip_spec.length
                    # Clamp to the source duration to avoid subclip errors.
                    clip = clip.subclip(clip_spec.trim_start, min(end, clip.duration))
                else:
                    clip = clip.subclip(clip_spec.trim_start)
                if clip_spec.scale != 1.0:
                    clip = clip.resize(clip_spec.scale)
                if clip.audio:
                    clip = clip.volumex(clip_spec.volume)

            # --- Image ---
            elif asset.type == 'image':
                if not src_path:
                    return None
                clip = ImageClip(src_path)
                if clip_spec.length:
                    clip = clip.set_duration(clip_spec.length)
                if clip_spec.scale != 1.0:
                    clip = clip.resize(clip_spec.scale)

            # --- Text ---
            elif asset.type == 'text':
                if not asset.text:
                    return None
                style = asset.style or {}  # style may be explicitly None
                # NOTE: TextClip requires ImageMagick to be installed.
                # bg_color defaults to 'transparent' (MoviePy's own default);
                # passing None here ends up on the ImageMagick command line.
                clip = TextClip(
                    asset.text,
                    fontsize=style.get('fontSize', 70),
                    color=style.get('color', 'white'),
                    font=style.get('font', 'Arial'),
                    bg_color=style.get('backgroundColor') or 'transparent',
                    stroke_color=style.get('stroke_color', None),
                    stroke_width=style.get('stroke_width', 1),
                    method='caption',
                    size=(int(screen_w * 0.9), None)  # auto-wrap to 90% width
                )
                if clip_spec.length:
                    clip = clip.set_duration(clip_spec.length)

            # --- Audio ---
            elif asset.type == 'audio':
                if not src_path:
                    return None
                clip = AudioFileClip(src_path)
                if clip_spec.length:
                    end = clip_spec.trim_start + clip_spec.length
                    clip = clip.subclip(clip_spec.trim_start, min(end, clip.duration))
                elif clip_spec.trim_start:
                    # Honour trim_start even without an explicit length,
                    # matching the video branch.
                    clip = clip.subclip(clip_spec.trim_start)
                clip = clip.volumex(clip_spec.volume)
                # Audio has no visual props: return before position/opacity.
                return clip

            # --- Common visual props ---
            if clip:
                pos = clip_spec.position
                if isinstance(pos, list):
                    pos = tuple(pos)  # JSON arrays arrive as lists; MoviePy wants tuples
                clip = clip.set_position(pos)
                if clip_spec.opacity < 1.0:
                    clip = clip.set_opacity(clip_spec.opacity)
                for anim in clip_spec.animations:
                    clip = self._apply_animation(clip, anim, screen_w, screen_h)
            return clip
        except Exception as e:
            print(f"Error creating clip for asset {asset}: {e}")
            return None

    def _create_text_clip_from_style(self, text, style, screen_w):
        """Create a styled TextClip, optionally with a drop-shadow layer.

        Returns a TextClip (no shadow), a CompositeVideoClip (shadow + text),
        or None if TextClip creation fails (e.g. ImageMagick missing).
        """
        try:
            fontsize = style.get('fontSize', 70)
            color = style.get('color', 'white')
            font = style.get('font', 'Arial')
            # 'transparent' is MoviePy's own default; None would end up on
            # the ImageMagick command line.
            bg_color = style.get('backgroundColor') or 'transparent'
            stroke_color = style.get('stroke_color', None)
            stroke_width = style.get('stroke_width', 0)
            # TextClip has no native drop shadow; we simulate one by
            # compositing an offset copy of the text behind the main layer.
            shadow_color = style.get('shadow_color', None)
            shadow_offset = style.get('shadow_offset', (2, 2))

            txt_clip = TextClip(
                text,
                fontsize=fontsize,
                color=color,
                font=font,
                bg_color=bg_color,
                stroke_color=stroke_color,
                stroke_width=stroke_width,
                method='caption',
                align='center',
                size=(int(screen_w * 0.9), None)  # auto-wrap to 90% width
            )
            if shadow_color:
                # Shadow layer: same text, offset by shadow_offset pixels.
                shadow_clip = TextClip(
                    text,
                    fontsize=fontsize,
                    color=shadow_color,
                    font=font,
                    method='caption',
                    align='center',
                    size=(int(screen_w * 0.9), None)
                ).set_position((shadow_offset[0], shadow_offset[1]))
                w, h = txt_clip.size
                # Canvas is padded so the offset shadow is not clipped.
                return CompositeVideoClip(
                    [shadow_clip, txt_clip.set_position('center')],
                    size=(w + abs(shadow_offset[0]) * 2, h + abs(shadow_offset[1]) * 2)
                )
            return txt_clip
        except Exception as e:
            print(f"Error creating text clip: {e}")
            return None

    def _apply_animation(self, clip, anim: Animation, w, h):
        """Apply one Animation spec to a clip and return the animated clip.

        Unknown or unimplemented animation types return the clip unchanged.
        """
        d = anim.duration
        if anim.type == 'fade_in':
            return clip.fadein(d)
        if anim.type == 'fade_out':
            return clip.fadeout(d)
        if anim.type == 'pop_in':
            # Linear scale up to full size. Clamp away from 0: resizing a
            # frame to 0x0 pixels raises inside MoviePy/PIL at t=0.
            return clip.resize(lambda t: max(0.01, min(1, t / d)) if t < d else 1)
        if anim.type == 'scale_in':
            # Zoom from 0.8 to 1.0 over the animation duration.
            return clip.resize(lambda t: 0.8 + 0.2 * (t / d) if t < d else 1)
        # TODO: 'slide_up' / 'slide_left' need string positions ('center',
        # 'bottom', ...) resolved to pixels before an offset can be animated;
        # until then they are deliberate no-ops.
        return clip
# ─────────────────────────────────────────────────────────────
# 3. Helpers (STT -> Timeline)
# ─────────────────────────────────────────────────────────────
def convert_whisper_to_timeline(
    whisper_result: dict,
    video_path: str,
    max_words_per_line: int = 5,
    base_style: Optional[dict] = None,
    highlight_style: Optional[dict] = None
) -> Timeline:
    """
    Convert Whisper STT output to a renderer Timeline.

    Args:
        whisper_result: The raw output from Whisper (segments with words).
        video_path: Path to the source video.
        max_words_per_line: Max words to show at once (auto-segmentation).
        base_style: Default text style (None is treated as {}).
        highlight_style: Style for the active word (karaoke effect).
            NOTE: accepted for forward compatibility but currently UNUSED —
            the previous implementation built ``Word`` objects, but no
            ``Word`` model exists and ``Asset`` has no ``words`` field, so
            it raised ``NameError`` on any input that contained words.

    Returns:
        A Timeline with the video on layer 0 and caption text clips on
        layer 1, one clip per group of up to ``max_words_per_line`` words.
    """
    # Avoid mutable default arguments; None is the backward-compatible
    # stand-in for the old `{}` defaults.
    base_style = base_style if base_style is not None else {}
    _ = highlight_style  # see docstring: pending word-level highlight schema

    tracks = []

    # 1. Video track (background layer).
    tracks.append(Track(clips=[
        Clip(
            asset=Asset(type='video', src=video_path),
            start=0,
            layer=0
        )
    ]))

    # 2. Caption track: flatten all segment words into one list, then chunk.
    all_words = []
    for seg in whisper_result.get('segments', []):
        all_words.extend(seg.get('words', []))

    text_clips = []
    for i in range(0, len(all_words), max_words_per_line):
        chunk = all_words[i:i + max_words_per_line]
        if not chunk:
            continue
        start_time = chunk[0]['start']
        end_time = chunk[-1]['end']
        text_content = " ".join(w['word'].strip() for w in chunk)
        text_clips.append(Clip(
            asset=Asset(
                type='text',
                text=text_content,
                style=base_style
            ),
            start=start_time,
            length=end_time - start_time,
            position='center',  # default caption position
            layer=1
        ))
    tracks.append(Track(clips=text_clips))

    return Timeline(background="#000000", tracks=tracks)
# NOTE(review): this module-level function is a verbatim duplicate of
# JSONRenderer._apply_animation (it even keeps the `self` parameter) and is
# never called anywhere in this file — it looks like an accidental
# copy-paste and is a candidate for removal.
def _apply_animation(self, clip, anim: Animation, w, h):
    """Apply MoviePy transformations for animations"""
    d = anim.duration
    if anim.type == 'fade_in':
        return clip.fadein(d)
    elif anim.type == 'fade_out':
        return clip.fadeout(d)
    elif anim.type == 'pop_in':
        # Scale from 0 to 1 with a slight bounce effect could be complex,
        # simple linear scale 0->1 for now
        return clip.resize(lambda t: min(1, t / d) if t < d else 1)
    elif anim.type == 'scale_in':
        # Zoom from 0.8 to 1.0
        return clip.resize(lambda t: 0.8 + 0.2 * (t / d) if t < d else 1)
    elif anim.type == 'slide_up':
        # Move from bottom to original position
        # Note: This overrides static position, so needs care.
        # We assume 'pos' was set to the final destination.
        # Get final x, y. This is tricky in MoviePy as pos can be strings.
        # Simplified: Slide from bottom of screen
        # NOTE(review): `slide` is defined but never used — the branch
        # falls through to `pass` and the clip is returned unchanged.
        def slide(t):
            if t >= d: return clip.pos(t) # Stay at final
            progress = t / d
            x, y = clip.pos(t)
            # If y is a string (like 'center'), we can't easily calculate offset without computing logic
            # Fallback to simple fade if pos is relative, or implement relative sliding later
            return x, y # Placeholder: Real sliding requires resolving 'center' to pixels
        # Better approach for slide: CompositeVideoClip handles pos better.
        # For now, let's use a simple transform if pos is absolute, else skip
        pass
    return clip