""" renderer.py — Pillow-based frame generator for kinetic typography Produces a generator of raw RGB bytes (width × height × 3) at TARGET_FPS. """ from typing import Generator from PIL import Image, ImageDraw from .transcriber import FrameMeta from .animations import ( FRAME_ANIMATIONS, THEME_COLORS, DEFAULT_FONT_FAMILY, get_font, ) # --- Constants --- WIDTH = 1280 HEIGHT = 720 TARGET_FPS = 30 def get_frame_times(frame: FrameMeta) -> tuple[float, float]: if not frame.words: return 0.0, 0.0 return frame.words[0].start, frame.words[-1].end def preprocess_bg(img: Image.Image) -> Image.Image: """ Resize and crop a background image to 1280x720, then apply a dark semi-transparent overlay to ensure readable text contrast. """ w, h = img.size aspect = WIDTH / HEIGHT if w / h > aspect: new_w = int(h * aspect) left = (w - new_w) // 2 img_cropped = img.crop((left, 0, left + new_w, h)) else: new_h = int(w / aspect) top = (h - new_h) // 2 img_cropped = img.crop((0, top, w, top + new_h)) img_final = img_cropped.resize((WIDTH, HEIGHT), Image.Resampling.LANCZOS) # Apply a dark overlay (RGBA overlay on top of background) overlay = Image.new("RGBA", (WIDTH, HEIGHT), (0, 0, 0, 140)) img_rgba = img_final.convert("RGBA") img_rgba.alpha_composite(overlay) return img_rgba.convert("RGB") def _get_bg_at_time(t: float, frames: list[FrameMeta], bg_list: list[Image.Image]) -> Image.Image | None: if not bg_list or not frames: return None next_idx = None for idx, frame in enumerate(frames): f_start, _ = get_frame_times(frame) if f_start > t: next_idx = idx break if next_idx == 0: f_start, _ = get_frame_times(frames[0]) dt = f_start - t if 0 <= dt <= 1.0: alpha = 1.0 - dt black = Image.new("RGB", (WIDTH, HEIGHT), (10, 10, 10)) return Image.blend(black, bg_list[0], alpha) return Image.new("RGB", (WIDTH, HEIGHT), (10, 10, 10)) if next_idx is None: return bg_list[-1] prev_idx = next_idx - 1 f_next_start, _ = get_frame_times(frames[next_idx]) dt = f_next_start - t if 0 <= dt <= 1.0: alpha = max(0.0, min(1.0, 1.0 - dt)) return Image.blend(bg_list[prev_idx], bg_list[next_idx], alpha) return bg_list[prev_idx] def _render_frame( t: float, frames: list[FrameMeta], theme_name: str, bg_image: Image.Image | None = None, font_family: str = DEFAULT_FONT_FAMILY, ) -> bytes: """Render a single frame at time t. Returns raw RGB bytes.""" theme = THEME_COLORS.get(theme_name, THEME_COLORS["Dark"]) if bg_image is not None: img = bg_image.copy() else: img = Image.new("RGB", (WIDTH, HEIGHT), theme.get("bg", (10,10,10))) draw = ImageDraw.Draw(img) # Find active frame active_frame: FrameMeta | None = None frame_idx = -1 for i, frame in enumerate(frames): f_start, f_end = get_frame_times(frame) # Frames stay on screen slightly longer than their last word if f_start <= t <= f_end + 0.5: active_frame = frame frame_idx = i break # If no active frame is found, we might be in a transition gap. # For now, just return the background if it's completely empty space. if active_frame is None: return img.tobytes() f_start, f_end = get_frame_times(active_frame) f_duration = f_end - f_start t_frame = t - f_start # Apply frame-level animation frame_anim_func = FRAME_ANIMATIONS.get(active_frame.frame_animation, FRAME_ANIMATIONS["none"]) fx_off, fy_off, f_scale, f_opacity = frame_anim_func(t_frame, f_duration) if f_opacity <= 0: return img.tobytes() # Calculate per-word layout (unscaled widths/heights). One font is used # for the whole render, so its bbox/space metrics are shared by all words. font = get_font(font_family) space_bbox = draw.textbbox((0, 0), " ", font=font) space_w = space_bbox[2] - space_bbox[0] word_layouts = [] max_h = 0 for w in active_frame.words: bbox = draw.textbbox((0, 0), w.text, font=font) ww, wh = bbox[2] - bbox[0], bbox[3] - bbox[1] word_layouts.append((w, bbox, ww, wh)) max_h = max(max_h, wh) # Greedily wrap words into rows that fit within the screen width, so a # long line breaks across multiple rows instead of overflowing or being # shrunk down to an unreadably small size. max_text_width = WIDTH * 0.92 rows: list[list[tuple]] = [[]] row_widths = [0.0] for layout in word_layouts: _, _, ww, _ = layout cur_row = rows[-1] added_w = ww + (space_w if cur_row else 0) if cur_row and row_widths[-1] + added_w > max_text_width: rows.append([]) row_widths.append(0.0) cur_row = rows[-1] added_w = ww cur_row.append(layout) row_widths[-1] += added_w # Shrink only in the rare case a single word alone is wider than the screen. max_row_w = max(row_widths) if max_row_w > 0 and max_row_w * f_scale > max_text_width: f_scale *= max_text_width / (max_row_w * f_scale) # Stack rows vertically, centered as a block. row_gap = max_h * 0.3 row_step = (max_h + row_gap) * f_scale total_block_h = max_h * f_scale + row_step * (len(rows) - 1) y_block_start = HEIGHT / 2 - total_block_h / 2 + fy_off # Render each word, row by row for row_idx, row in enumerate(rows): row_w = row_widths[row_idx] x_cursor = (WIDTH - row_w * f_scale) / 2 + fx_off y_center = y_block_start + row_idx * row_step for (w, bbox, ww, wh) in row: # Words not yet spoken are dimmed; spoken/current words use the # theme's main text color. base_color = theme["theme_default"] if t < w.start: base_color = theme.get("inactive", (80, 80, 80)) display_text = w.text # Note: Pillow doesn't support text scaling directly without loading # a new font, so for frame-level zoom we render the word to a small # RGBA buffer and resize that instead. if abs(f_scale - 1.0) > 0.01 or f_opacity < 0.99: pad = int(wh * 0.5) buf_w, buf_h = int(ww + pad*2), int(wh + pad*2) word_img = Image.new("RGBA", (buf_w, buf_h), (0,0,0,0)) word_draw = ImageDraw.Draw(word_img) # Draw offset by the glyph's own bbox origin so its ink always # lands within [pad, pad+ww] x [pad, pad+wh] — fully inside the # buffer regardless of ascenders/descenders (fixes low-profile # words like "now" getting clipped at the bottom). word_draw.text((pad - bbox[0], pad - bbox[1]), display_text, font=font, fill=(base_color[0], base_color[1], base_color[2], 255)) if f_scale != 1.0: new_w, new_h = int(buf_w * f_scale), int(buf_h * f_scale) if new_w > 0 and new_h > 0: word_img = word_img.resize((new_w, new_h), Image.Resampling.LANCZOS) if f_opacity < 1.0: alpha = word_img.split()[3] alpha = alpha.point(lambda p: p * f_opacity) word_img.putalpha(alpha) paste_x = int(x_cursor + (bbox[0] - pad) * f_scale) paste_y = int(y_center + (bbox[1] - pad) * f_scale) img.paste(word_img, (paste_x, paste_y), word_img) else: draw.text( (int(x_cursor), int(y_center)), display_text, font=font, fill=base_color ) x_cursor += (ww + space_w) * f_scale # Frame-level flash: a brief white wash that decays over the frame's first 0.25s if active_frame.frame_animation == "flash": FLASH_DUR = 0.25 if 0.0 <= t_frame < FLASH_DUR: alpha = (1.0 - t_frame / FLASH_DUR) * 0.8 white = Image.new("RGB", img.size, (255, 255, 255)) img = Image.blend(img, white, alpha) return img.tobytes() def render_frames( frames: list[FrameMeta], duration: float, theme_name: str = "Dark", bg_images: list | None = None, font_family: str = DEFAULT_FONT_FAMILY, ) -> Generator[bytes, None, None]: """ Yields raw RGB frame bytes for every frame of the video. duration: total audio duration in seconds. """ bg_list = None if isinstance(bg_images, list) and len(bg_images) > 0: processed = [preprocess_bg(img) for img in bg_images] # Each background image covers two consecutive lyric frames, so the # backdrop changes about half as often as the on-screen text. expanded = [] for img in processed: expanded.extend([img, img]) if len(expanded) < len(frames): expanded.extend([expanded[-1]] * (len(frames) - len(expanded))) bg_list = expanded[:len(frames)] elif bg_images is not None: bg_list = [preprocess_bg(bg_images)] * len(frames) total_frames = int(duration * TARGET_FPS) for frame_idx in range(total_frames): t = frame_idx / TARGET_FPS current_bg = None if bg_list is not None: current_bg = _get_bg_at_time(t, frames, bg_list) yield _render_frame(t, frames, theme_name, bg_image=current_bg, font_family=font_family)