aMuseMe / src /amuseme /renderer.py
Blazestorm001's picture
chore: tidy Space repository structure
08ab8f1 verified
Raw
History Blame Contribute Delete
9.79 kB
"""
renderer.py — Pillow-based frame generator for kinetic typography
Produces a generator of raw RGB bytes (width × height × 3) at TARGET_FPS.
"""
from typing import Generator
from PIL import Image, ImageDraw
from .transcriber import FrameMeta
from .animations import (
FRAME_ANIMATIONS,
THEME_COLORS,
DEFAULT_FONT_FAMILY,
get_font,
)
# --- Output geometry and timing ---
# Frame size in pixels: backgrounds are resized to it and blank frames are
# created at it.
WIDTH, HEIGHT = 1280, 720
# Number of video frames generated per second of audio.
TARGET_FPS = 30
def get_frame_times(frame: FrameMeta) -> tuple[float, float]:
    """Return the ``(start, end)`` time span covered by a lyric frame.

    The span runs from the start of the frame's first word to the end of its
    last word. A frame without any words yields ``(0.0, 0.0)``.
    """
    words = frame.words
    if words:
        first_word, last_word = words[0], words[-1]
        return first_word.start, last_word.end
    return 0.0, 0.0
def preprocess_bg(img: Image.Image) -> Image.Image:
    """
    Prepare a background image for rendering.

    The image is center-cropped to the output aspect ratio, resized to
    WIDTH x HEIGHT, and darkened with a semi-transparent black overlay to
    ensure readable text contrast.

    Args:
        img: Source image. It is not modified.

    Returns:
        A new RGB image of size (WIDTH, HEIGHT).

    Raises:
        ValueError: If the source image has a zero width or height.
    """
    w, h = img.size
    if w <= 0 or h <= 0:
        raise ValueError(f"background image has an empty size: {w}x{h}")

    # Pillow always falls back to NEAREST resampling for palette ("P") and
    # bilevel ("1") images, which would silently ignore the LANCZOS filter
    # requested below. Convert those to a true-colour mode first.
    if img.mode in ("1", "P"):
        img = img.convert("RGBA")

    aspect = WIDTH / HEIGHT
    if w / h > aspect:
        # Source is wider than the target: trim the left and right edges.
        new_w = max(1, int(h * aspect))
        left = (w - new_w) // 2
        crop_box = (left, 0, left + new_w, h)
    else:
        # Source is taller than (or matches) the target: trim top and bottom.
        new_h = max(1, int(w / aspect))
        top = (h - new_h) // 2
        crop_box = (0, top, w, top + new_h)
    img_cropped = img.crop(crop_box)
    img_final = img_cropped.resize((WIDTH, HEIGHT), Image.Resampling.LANCZOS)

    # Apply a dark overlay (RGBA overlay on top of background)
    overlay = Image.new("RGBA", (WIDTH, HEIGHT), (0, 0, 0, 140))
    img_rgba = img_final.convert("RGBA")
    img_rgba.alpha_composite(overlay)
    return img_rgba.convert("RGB")
def _get_bg_at_time(t: float, frames: list[FrameMeta], bg_list: list[Image.Image]) -> Image.Image | None:
    """
    Pick (or cross-fade) the background to show at time t.

    ``bg_list[i]`` is treated as the background of ``frames[i]``. During the
    last second before a frame starts, the previous background is blended
    into that frame's background. Before the first frame the backdrop is a
    near-black fill that fades into the first background over the final
    second.

    Args:
        t: Playback time in seconds.
        frames: Lyric frames; the first frame whose start is later than t is
            treated as the upcoming one.
        bg_list: Backgrounds indexed like ``frames``. If it is shorter than
            ``frames``, its last entry is reused for the remaining frames.

    Returns:
        The background image, or None when there are no frames or no
        backgrounds. The result may be one of the ``bg_list`` objects
        themselves, so callers must copy it before drawing on it.
    """
    if not bg_list or not frames:
        return None

    last_bg_idx = len(bg_list) - 1

    def bg_for(idx: int) -> Image.Image:
        # Clamp so a background list shorter than `frames` cannot raise.
        return bg_list[min(idx, last_bg_idx)]

    # Locate the first frame that has not started yet.
    next_idx = None
    next_start = 0.0
    for idx, frame in enumerate(frames):
        f_start, _ = get_frame_times(frame)
        if f_start > t:
            next_idx, next_start = idx, f_start
            break

    # Every frame has already started: hold the final background.
    if next_idx is None:
        return bg_list[-1]

    dt = next_start - t
    in_fade = 0 <= dt <= 1.0

    if next_idx == 0:
        # Lead-in before the first frame: fade up from near-black.
        black = Image.new("RGB", (WIDTH, HEIGHT), (10, 10, 10))
        if in_fade:
            return Image.blend(black, bg_for(0), 1.0 - dt)
        return black

    prev_bg = bg_for(next_idx - 1)
    next_bg = bg_for(next_idx)
    # Consecutive frames often share the very same image object; blending an
    # image with itself changes nothing, so skip that full-frame operation.
    if in_fade and prev_bg is not next_bg:
        alpha = max(0.0, min(1.0, 1.0 - dt))
        return Image.blend(prev_bg, next_bg, alpha)
    return prev_bg
def _find_active_frame(t: float, frames: list[FrameMeta]) -> FrameMeta | None:
    """Return the latest-starting frame whose display window contains t, if any."""
    active: FrameMeta | None = None
    active_start = 0.0
    for frame in frames:
        if not frame.words:
            # get_frame_times() reports (0.0, 0.0) for a word-less frame; that
            # is a placeholder, not a real span, and there is nothing to draw.
            continue
        f_start, f_end = get_frame_times(frame)
        # Frames stay on screen slightly longer than their last word. When the
        # next frame starts inside that linger window, the newer frame wins so
        # its words (and entry animation) are not delayed.
        if f_start <= t <= f_end + 0.5 and (active is None or f_start >= active_start):
            active, active_start = frame, f_start
    return active


def _wrap_rows(
    word_layouts: list[tuple],
    space_w: float,
    max_width: float,
) -> tuple[list[list[tuple]], list[float]]:
    """Greedily pack word layouts into rows no wider than max_width.

    Each layout is a ``(word, bbox, width, height)`` tuple. Returns the rows
    and, in parallel, each row's unscaled width including inter-word spaces.
    A single word wider than max_width still gets a row of its own.
    """
    rows: list[list[tuple]] = [[]]
    row_widths = [0.0]
    for layout in word_layouts:
        ww = layout[2]
        added_w = ww + (space_w if rows[-1] else 0)
        if rows[-1] and row_widths[-1] + added_w > max_width:
            rows.append([])
            row_widths.append(0.0)
            added_w = ww
        rows[-1].append(layout)
        row_widths[-1] += added_w
    return rows, row_widths


def _render_frame(
    t: float,
    frames: list[FrameMeta],
    theme_name: str,
    bg_image: Image.Image | None = None,
    font_family: str = DEFAULT_FONT_FAMILY,
) -> bytes:
    """Render a single frame at time t. Returns the image's raw bytes.

    Args:
        t: Playback time in seconds.
        frames: Lyric frames to choose the on-screen text from.
        theme_name: Key into THEME_COLORS; unknown names fall back to "Dark".
        bg_image: Optional backdrop. It is copied, never drawn on directly.
            Without it, a WIDTH x HEIGHT RGB image filled with the theme's
            "bg" colour is used.
        font_family: Font family handed to get_font().

    Returns:
        ``Image.tobytes()`` of the finished frame (RGB data when the backdrop
        is an RGB image or absent).
    """
    theme = THEME_COLORS.get(theme_name, THEME_COLORS["Dark"])

    if bg_image is not None:
        img = bg_image.copy()
    else:
        img = Image.new("RGB", (WIDTH, HEIGHT), theme.get("bg", (10, 10, 10)))
    draw = ImageDraw.Draw(img)

    # With no active frame we are in a gap between lyrics: show the backdrop.
    active_frame = _find_active_frame(t, frames)
    if active_frame is None:
        return img.tobytes()

    f_start, f_end = get_frame_times(active_frame)
    f_duration = f_end - f_start
    t_frame = t - f_start

    # Apply frame-level animation
    frame_anim_func = FRAME_ANIMATIONS.get(active_frame.frame_animation, FRAME_ANIMATIONS["none"])
    fx_off, fy_off, f_scale, f_opacity = frame_anim_func(t_frame, f_duration)
    if f_opacity <= 0:
        return img.tobytes()

    # Calculate per-word layout (unscaled widths/heights). One font is used
    # for the whole render, so its bbox/space metrics are shared by all words.
    font = get_font(font_family)
    space_bbox = draw.textbbox((0, 0), " ", font=font)
    space_w = space_bbox[2] - space_bbox[0]

    word_layouts = []
    max_h = 0
    for w in active_frame.words:
        bbox = draw.textbbox((0, 0), w.text, font=font)
        ww, wh = bbox[2] - bbox[0], bbox[3] - bbox[1]
        word_layouts.append((w, bbox, ww, wh))
        max_h = max(max_h, wh)

    # Wrap words into rows that fit within the screen width, so a long line
    # breaks across multiple rows instead of overflowing or being shrunk down
    # to an unreadably small size.
    max_text_width = WIDTH * 0.92
    rows, row_widths = _wrap_rows(word_layouts, space_w, max_text_width)

    # Shrink only when the widest row, after the animation's scale, would
    # still exceed the usable width (e.g. a single over-long word).
    max_row_w = max(row_widths)
    if max_row_w > 0 and max_row_w * f_scale > max_text_width:
        f_scale *= max_text_width / (max_row_w * f_scale)

    # Stack rows vertically, centered as a block.
    row_gap = max_h * 0.3
    row_step = (max_h + row_gap) * f_scale
    total_block_h = max_h * f_scale + row_step * (len(rows) - 1)
    y_block_start = HEIGHT / 2 - total_block_h / 2 + fy_off

    # Words not yet spoken are dimmed; spoken/current words use the theme's
    # main text color.
    spoken_color = theme["theme_default"]
    pending_color = theme.get("inactive", (80, 80, 80))

    # Pillow can't scale text without loading a new font, so for frame-level
    # zoom or fade each word is drawn into a small RGBA buffer that is then
    # resized/faded and pasted. Scale and opacity are constant for the frame.
    use_buffer = abs(f_scale - 1.0) > 0.01 or f_opacity < 0.99

    # Render each word, row by row
    for row_idx, row in enumerate(rows):
        x_cursor = (WIDTH - row_widths[row_idx] * f_scale) / 2 + fx_off
        row_y = y_block_start + row_idx * row_step

        for (w, bbox, ww, wh) in row:
            base_color = pending_color if t < w.start else spoken_color
            display_text = w.text

            if use_buffer:
                pad = int(wh * 0.5)
                buf_w, buf_h = int(ww + pad * 2), int(wh + pad * 2)
                word_img = Image.new("RGBA", (buf_w, buf_h), (0, 0, 0, 0))
                word_draw = ImageDraw.Draw(word_img)
                # Draw offset by the glyph's own bbox origin so its ink always
                # lands within [pad, pad+ww] x [pad, pad+wh] — fully inside the
                # buffer regardless of ascenders/descenders.
                word_draw.text(
                    (pad - bbox[0], pad - bbox[1]),
                    display_text,
                    font=font,
                    fill=(base_color[0], base_color[1], base_color[2], 255),
                )

                if f_scale != 1.0:
                    new_w, new_h = int(buf_w * f_scale), int(buf_h * f_scale)
                    if new_w > 0 and new_h > 0:
                        word_img = word_img.resize((new_w, new_h), Image.Resampling.LANCZOS)

                if f_opacity < 1.0:
                    alpha = word_img.split()[3]
                    alpha = alpha.point(lambda p: p * f_opacity)
                    word_img.putalpha(alpha)

                # Position the buffer so the ink lands where a direct
                # draw.text() at (x_cursor, row_y) would have put it.
                paste_x = int(x_cursor + (bbox[0] - pad) * f_scale)
                paste_y = int(row_y + (bbox[1] - pad) * f_scale)
                img.paste(word_img, (paste_x, paste_y), word_img)
            else:
                draw.text(
                    (int(x_cursor), int(row_y)),
                    display_text,
                    font=font,
                    fill=base_color,
                )

            x_cursor += (ww + space_w) * f_scale

    # Frame-level flash: a brief white wash that decays over the frame's first 0.25s
    if active_frame.frame_animation == "flash":
        FLASH_DUR = 0.25
        if 0.0 <= t_frame < FLASH_DUR:
            alpha = (1.0 - t_frame / FLASH_DUR) * 0.8
            white = Image.new("RGB", img.size, (255, 255, 255))
            img = Image.blend(img, white, alpha)

    return img.tobytes()
def render_frames(
    frames: list[FrameMeta],
    duration: float,
    theme_name: str = "Dark",
    bg_images: list | None = None,
    font_family: str = DEFAULT_FONT_FAMILY,
) -> Generator[bytes, None, None]:
    """
    Yields raw frame bytes for every frame of the video.

    Args:
        frames: Lyric frames to render.
        duration: Total audio duration in seconds; ``int(duration *
            TARGET_FPS)`` frames are produced.
        theme_name: Theme passed through to the frame renderer.
        bg_images: Optional backgrounds: a list/tuple of images, a single
            image, or None. An empty list/tuple is treated like None.
        font_family: Font family passed through to the frame renderer.

    Yields:
        One bytes object per video frame, in playback order.
    """
    bg_list = None
    if isinstance(bg_images, (list, tuple)):
        # An empty sequence means "no backgrounds"; it must not be handed to
        # preprocess_bg() as if it were a single image.
        if bg_images:
            processed = [preprocess_bg(img) for img in bg_images]
            # Each background image covers two consecutive lyric frames, so the
            # backdrop changes about half as often as the on-screen text.
            expanded = [img for img in processed for _ in range(2)]
            # Too few images: keep showing the last one for the remaining frames.
            missing = len(frames) - len(expanded)
            if missing > 0:
                expanded.extend([expanded[-1]] * missing)
            bg_list = expanded[:len(frames)]
    elif bg_images is not None:
        # A single image: use it behind every frame.
        bg_list = [preprocess_bg(bg_images)] * len(frames)

    total_frames = int(duration * TARGET_FPS)
    for frame_idx in range(total_frames):
        t = frame_idx / TARGET_FPS
        current_bg = None
        if bg_list is not None:
            current_bg = _get_bg_at_time(t, frames, bg_list)
        yield _render_frame(t, frames, theme_name, bg_image=current_bg, font_family=font_family)