Spaces:
Sleeping
Sleeping
"""
renderer.py — Pillow-based frame generator for kinetic typography.

Yields one raw RGB byte string (WIDTH × HEIGHT × 3 bytes) per video frame,
sampled at TARGET_FPS frames per second.
"""
| from typing import Generator | |
| from PIL import Image, ImageDraw | |
| from .transcriber import FrameMeta | |
| from .animations import ( | |
| FRAME_ANIMATIONS, | |
| THEME_COLORS, | |
| DEFAULT_FONT_FAMILY, | |
| get_font, | |
| ) | |
# --- Constants ---
# Output canvas size in pixels; every rendered frame and every preprocessed
# background is exactly WIDTH x HEIGHT.
WIDTH = 1280
HEIGHT = 720
# Number of video frames rendered per second of audio.
TARGET_FPS = 30
def get_frame_times(frame: FrameMeta) -> tuple[float, float]:
    """Return the ``(start, end)`` time span covered by a lyric frame.

    The span runs from the first word's ``start`` to the last word's ``end``.
    A frame that has no words yields ``(0.0, 0.0)``.
    """
    words = frame.words
    if words:
        first, last = words[0], words[-1]
        return first.start, last.end
    return 0.0, 0.0
def preprocess_bg(img: Image.Image) -> Image.Image:
    """
    Fit a background image to the WIDTH x HEIGHT canvas and darken it.

    The image is center-cropped to the canvas aspect ratio, resampled to
    exactly WIDTH x HEIGHT with LANCZOS, then covered by a black layer at
    alpha 140/255 so text drawn on top keeps readable contrast.
    Returns a new RGB image; the input image is not modified.
    """
    src_w, src_h = img.size
    target_ratio = WIDTH / HEIGHT

    if src_w / src_h > target_ratio:
        # Source is wider than the canvas: keep full height, trim both sides.
        crop_w = int(src_h * target_ratio)
        x0 = (src_w - crop_w) // 2
        box = (x0, 0, x0 + crop_w, src_h)
    else:
        # Source is taller (or already matches): keep full width, trim top/bottom.
        crop_h = int(src_w / target_ratio)
        y0 = (src_h - crop_h) // 2
        box = (0, y0, src_w, y0 + crop_h)

    fitted = img.crop(box).resize((WIDTH, HEIGHT), Image.Resampling.LANCZOS)

    # Composite a translucent black layer over the picture, then drop alpha.
    shade = Image.new("RGBA", (WIDTH, HEIGHT), (0, 0, 0, 140))
    darkened = Image.alpha_composite(fitted.convert("RGBA"), shade)
    return darkened.convert("RGB")
def _get_bg_at_time(t: float, frames: list[FrameMeta], bg_list: list[Image.Image]) -> Image.Image | None:
    """
    Choose (or cross-fade) the background image to show at time t.

    bg_list is indexed in parallel with frames. During the last second before
    a frame starts, the previous frame's background is blended toward that
    frame's background. Before the first frame the picture is near-black
    (10, 10, 10), fading into the first background over that final second.
    Once no later frame remains, the last background is held.
    Returns None when there are no backgrounds or no frames.
    """
    if not (bg_list and frames):
        return None

    # Index of the first frame that has not started yet at time t.
    upcoming = next(
        (i for i, fr in enumerate(frames) if get_frame_times(fr)[0] > t),
        None,
    )
    if upcoming is None:
        return bg_list[-1]

    # Seconds remaining until the upcoming frame begins.
    lead = get_frame_times(frames[upcoming])[0] - t
    fading = 0 <= lead <= 1.0

    if upcoming == 0:
        black = Image.new("RGB", (WIDTH, HEIGHT), (10, 10, 10))
        return Image.blend(black, bg_list[0], 1.0 - lead) if fading else black

    current = bg_list[upcoming - 1]
    if not fading:
        return current
    mix = max(0.0, min(1.0, 1.0 - lead))
    return Image.blend(current, bg_list[upcoming], mix)
def _find_active_frame(t: float, frames: list[FrameMeta]) -> FrameMeta | None:
    """Pick the lyric frame to display at time t, or None during a gap."""
    chosen = None
    chosen_start = 0.0
    for frame in frames:
        if not frame.words:
            # get_frame_times() reports (0.0, 0.0) for an empty frame, which
            # would otherwise claim the first half second of the video.
            continue
        f_start, f_end = get_frame_times(frame)
        # Frames stay on screen slightly longer than their last word, but a
        # frame that has already started always wins over a lingering one, so
        # the linger only fills real gaps and never hides a new frame's intro.
        if f_start <= t <= f_end + 0.5 and (chosen is None or f_start > chosen_start):
            chosen, chosen_start = frame, f_start
    return chosen


def _wrap_rows(
    word_layouts: list[tuple],
    space_w: float,
    max_width: float,
) -> tuple[list[list[tuple]], list[float]]:
    """Greedily pack measured words into rows no wider than max_width."""
    rows: list[list[tuple]] = [[]]
    row_widths = [0.0]
    for layout in word_layouts:
        ww = layout[2]
        added_w = ww + (space_w if rows[-1] else 0)
        if rows[-1] and row_widths[-1] + added_w > max_width:
            rows.append([])
            row_widths.append(0.0)
            added_w = ww  # first word of a row has no leading space
        rows[-1].append(layout)
        row_widths[-1] += added_w
    return rows, row_widths


def _render_frame(
    t: float,
    frames: list[FrameMeta],
    theme_name: str,
    bg_image: Image.Image | None = None,
    font_family: str = DEFAULT_FONT_FAMILY,
) -> bytes:
    """Render a single frame at time t. Returns raw RGB bytes.

    Args:
        t: Timestamp of the frame, in seconds.
        frames: Lyric frames; each provides ``words`` (with ``text``,
            ``start``, ``end``) and a ``frame_animation`` name.
        theme_name: Key into THEME_COLORS; unknown names fall back to "Dark".
        bg_image: Optional background to draw on (copied, never mutated).
            When None, a solid canvas in the theme's "bg" color is used.
        font_family: Font family passed to get_font().

    Returns:
        ``img.tobytes()`` of the WIDTH x HEIGHT canvas. When no frame is
        active, or the frame animation reports zero opacity, this is just
        the background.
    """
    theme = THEME_COLORS.get(theme_name, THEME_COLORS["Dark"])
    if bg_image is not None:
        img = bg_image.copy()
    else:
        img = Image.new("RGB", (WIDTH, HEIGHT), theme.get("bg", (10, 10, 10)))
    draw = ImageDraw.Draw(img)

    # If no active frame is found we are in a gap between frames: show only
    # the background.
    active_frame = _find_active_frame(t, frames)
    if active_frame is None:
        return img.tobytes()

    f_start, f_end = get_frame_times(active_frame)
    f_duration = f_end - f_start
    t_frame = t - f_start

    # Apply frame-level animation
    frame_anim_func = FRAME_ANIMATIONS.get(active_frame.frame_animation, FRAME_ANIMATIONS["none"])
    fx_off, fy_off, f_scale, f_opacity = frame_anim_func(t_frame, f_duration)
    if f_opacity <= 0:
        return img.tobytes()

    # Calculate per-word layout (unscaled widths/heights). One font is used
    # for the whole render, so its bbox/space metrics are shared by all words.
    font = get_font(font_family)
    space_bbox = draw.textbbox((0, 0), " ", font=font)
    space_w = space_bbox[2] - space_bbox[0]
    word_layouts = []
    max_h = 0
    for w in active_frame.words:
        bbox = draw.textbbox((0, 0), w.text, font=font)
        ww, wh = bbox[2] - bbox[0], bbox[3] - bbox[1]
        word_layouts.append((w, bbox, ww, wh))
        max_h = max(max_h, wh)

    # Greedily wrap words into rows that fit within the screen width, so a
    # long line breaks across multiple rows instead of overflowing or being
    # shrunk down to an unreadably small size.
    max_text_width = WIDTH * 0.92
    rows, row_widths = _wrap_rows(word_layouts, space_w, max_text_width)

    # Shrink only in the rare case a single word alone is wider than the screen.
    max_row_w = max(row_widths)
    if max_row_w > 0 and max_row_w * f_scale > max_text_width:
        f_scale *= max_text_width / (max_row_w * f_scale)

    # Stack rows vertically, centered as a block.
    row_gap = max_h * 0.3
    row_step = (max_h + row_gap) * f_scale
    total_block_h = max_h * f_scale + row_step * (len(rows) - 1)
    y_block_start = HEIGHT / 2 - total_block_h / 2 + fy_off

    # Words not yet spoken are dimmed; spoken/current words use the theme's
    # main text color.
    active_color = theme["theme_default"]
    inactive_color = theme.get("inactive", (80, 80, 80))

    # Pillow cannot scale or fade text directly, so whenever the frame is
    # zoomed or translucent each word is drawn into its own small RGBA buffer
    # that is then resized / faded and pasted onto the canvas.
    use_buffer = abs(f_scale - 1.0) > 0.01 or f_opacity < 0.99

    # Render each word, row by row
    for row_idx, row in enumerate(rows):
        x_cursor = (WIDTH - row_widths[row_idx] * f_scale) / 2 + fx_off
        y_top = y_block_start + row_idx * row_step
        for (w, bbox, ww, wh) in row:
            base_color = inactive_color if t < w.start else active_color
            if use_buffer:
                pad = int(wh * 0.5)
                buf_w, buf_h = int(ww + pad * 2), int(wh + pad * 2)
                out_w, out_h = buf_w, buf_h
                if f_scale != 1.0:
                    out_w, out_h = int(buf_w * f_scale), int(buf_h * f_scale)
                # A word whose scaled size collapses to nothing (e.g. the very
                # start of a zoom-in) is skipped rather than pasted unscaled.
                if out_w > 0 and out_h > 0:
                    word_img = Image.new("RGBA", (buf_w, buf_h), (0, 0, 0, 0))
                    word_draw = ImageDraw.Draw(word_img)
                    # Draw offset by the glyph's own bbox origin so its ink
                    # always lands within [pad, pad+ww] x [pad, pad+wh] —
                    # fully inside the buffer regardless of ascenders or
                    # descenders.
                    word_draw.text(
                        (pad - bbox[0], pad - bbox[1]),
                        w.text,
                        font=font,
                        fill=(base_color[0], base_color[1], base_color[2], 255),
                    )
                    if f_scale != 1.0:
                        word_img = word_img.resize((out_w, out_h), Image.Resampling.LANCZOS)
                    if f_opacity < 1.0:
                        alpha = word_img.split()[3]
                        alpha = alpha.point(lambda p: p * f_opacity)
                        word_img.putalpha(alpha)
                    # Undo the buffer padding so the ink lands where a direct
                    # draw.text() at (x_cursor, y_top) would have put it.
                    paste_x = int(x_cursor + (bbox[0] - pad) * f_scale)
                    paste_y = int(y_top + (bbox[1] - pad) * f_scale)
                    img.paste(word_img, (paste_x, paste_y), word_img)
            else:
                draw.text(
                    (int(x_cursor), int(y_top)),
                    w.text,
                    font=font,
                    fill=base_color,
                )
            x_cursor += (ww + space_w) * f_scale

    # Frame-level flash: a brief white wash that decays over the frame's first 0.25s
    if active_frame.frame_animation == "flash":
        flash_dur = 0.25
        if 0.0 <= t_frame < flash_dur:
            alpha = (1.0 - t_frame / flash_dur) * 0.8
            white = Image.new("RGB", img.size, (255, 255, 255))
            img = Image.blend(img, white, alpha)

    return img.tobytes()
def render_frames(
    frames: list[FrameMeta],
    duration: float,
    theme_name: str = "Dark",
    bg_images: list | None = None,
    font_family: str = DEFAULT_FONT_FAMILY,
) -> Generator[bytes, None, None]:
    """
    Yields raw RGB frame bytes for every frame of the video.

    Args:
        frames: Lyric frames to display, in playback order.
        duration: Total audio duration in seconds; ``int(duration *
            TARGET_FPS)`` frames are produced.
        theme_name: Theme key forwarded to the per-frame renderer.
        bg_images: Optional backgrounds. A list (or tuple) of images is
            spread over the lyric frames, each image covering two consecutive
            frames and the last image repeating for any frames left over. A
            single image is used for every frame. None or an empty sequence
            means no background images (the theme color is used instead).
        font_family: Font family forwarded to the per-frame renderer.
    """
    bg_list = None
    if isinstance(bg_images, (list, tuple)):
        # An empty sequence means "no backgrounds"; it must not be handed to
        # preprocess_bg() as if it were a single image.
        if bg_images:
            processed = [preprocess_bg(img) for img in bg_images]
            last = len(processed) - 1
            # Each background image covers two consecutive lyric frames, so
            # the backdrop changes about half as often as the on-screen text;
            # the final image is reused once the list runs out.
            bg_list = [processed[min(i // 2, last)] for i in range(len(frames))]
    elif bg_images is not None:
        bg_list = [preprocess_bg(bg_images)] * len(frames)

    total_frames = int(duration * TARGET_FPS)
    for frame_idx in range(total_frames):
        t = frame_idx / TARGET_FPS
        current_bg = None
        if bg_list is not None:
            current_bg = _get_bg_at_time(t, frames, bg_list)
        yield _render_frame(t, frames, theme_name, bg_image=current_bg, font_family=font_family)