Spaces:

build-small-hackathon
/

aMuseMe

Sleeping

App Files Files Community

aMuseMe / src /amuseme /renderer.py

Blazestorm001

chore: tidy Space repository structure

08ab8f1 verified 20 days ago

Raw

History Blame Contribute Delete

9.79 kB

	"""
	renderer.py — Pillow-based frame generator for kinetic typography
	Produces a generator of raw RGB bytes (width × height × 3) at TARGET_FPS.
	"""
	from typing import Generator
	from PIL import Image, ImageDraw

	from .transcriber import FrameMeta
	from .animations import (
	FRAME_ANIMATIONS,
	THEME_COLORS,
	DEFAULT_FONT_FAMILY,
	get_font,
	)

	# --- Constants ---
	# Output canvas size in pixels; backgrounds and blank frames are created at
	# exactly WIDTH x HEIGHT.
	WIDTH = 1280
	HEIGHT = 720
	# Frames generated per second of duration; also used to derive each frame's
	# timestamp in render_frames.
	TARGET_FPS = 30


	def get_frame_times(frame: FrameMeta) -> tuple[float, float]:
	    """Return the ``(start, end)`` time span covered by a lyric frame.

	    The span runs from the ``start`` of the frame's first word to the ``end``
	    of its last word. A frame without any words yields ``(0.0, 0.0)``.
	    """
	    words = frame.words
	    if not words:
	        return 0.0, 0.0
	    return words[0].start, words[-1].end


	def preprocess_bg(img: Image.Image, *, overlay_alpha: int = 140) -> Image.Image:
	    """Fit a background image to the output canvas and darken it.

	    The image is center-cropped to the canvas aspect ratio (WIDTH / HEIGHT),
	    resized to WIDTH x HEIGHT, and covered with a black semi-transparent
	    overlay so that text drawn on top stays readable.

	    Args:
	        img: Source background image.
	        overlay_alpha: Alpha channel value of the black overlay (0 leaves the
	            image untouched, 255 turns it fully black).

	    Returns:
	        A new RGB image of size WIDTH x HEIGHT.

	    Raises:
	        ValueError: If the source image has a zero width or height.
	    """
	    src_w, src_h = img.size
	    if src_w <= 0 or src_h <= 0:
	        raise ValueError(f"background image has an empty size: {img.size}")

	    target_aspect = WIDTH / HEIGHT
	    if src_w / src_h > target_aspect:
	        # Source is wider than the canvas: trim equal amounts left and right.
	        crop_w = max(1, int(src_h * target_aspect))
	        left = (src_w - crop_w) // 2
	        box = (left, 0, left + crop_w, src_h)
	    else:
	        # Source is taller than (or matches) the canvas: trim top and bottom.
	        crop_h = max(1, int(src_w / target_aspect))
	        top = (src_h - crop_h) // 2
	        box = (0, top, src_w, top + crop_h)

	    fitted = img.crop(box).resize((WIDTH, HEIGHT), Image.Resampling.LANCZOS)

	    # Composite the dark overlay in RGBA space, then drop the alpha channel.
	    overlay = Image.new("RGBA", (WIDTH, HEIGHT), (0, 0, 0, overlay_alpha))
	    canvas = fitted.convert("RGBA")
	    canvas.alpha_composite(overlay)
	    return canvas.convert("RGB")


	def _get_bg_at_time(
	    t: float,
	    frames: list[FrameMeta],
	    bg_list: list[Image.Image],
	) -> Image.Image | None:
	    """Pick (or cross-fade) the background image to show at time ``t``.

	    ``bg_list[i]`` is the background for ``frames[i]``. During the last second
	    before a frame starts, the previous background is blended into the
	    upcoming one; before the first frame the fade starts from a near-black
	    canvas.

	    Args:
	        t: Timestamp in seconds.
	        frames: Lyric frames, expected in playback order.
	        bg_list: Backgrounds, one per frame. If it is shorter than ``frames``
	            the last background is reused for the remaining frames.

	    Returns:
	        The background image for ``t``, or ``None`` when there are no frames
	        or no backgrounds. The returned image may be an element of
	        ``bg_list`` itself, so callers must copy it before drawing on it.
	    """
	    if not bg_list or not frames:
	        return None

	    fade_seconds = 1.0

	    # Index of the first frame that has not started yet at time t.
	    next_idx = None
	    for idx, frame in enumerate(frames):
	        if get_frame_times(frame)[0] > t:
	            next_idx = idx
	            break

	    if next_idx is None:
	        # Every frame has already started: hold the final background.
	        return bg_list[-1]

	    # Seconds left until the upcoming frame starts.
	    dt = get_frame_times(frames[next_idx])[0] - t
	    in_fade = 0.0 <= dt <= fade_seconds

	    # Clamp indices so a bg_list shorter than frames cannot raise IndexError.
	    last_bg = len(bg_list) - 1
	    upcoming = bg_list[min(next_idx, last_bg)]

	    if next_idx == 0:
	        # Before the first frame: fade in from a near-black canvas.
	        black = Image.new("RGB", (WIDTH, HEIGHT), (10, 10, 10))
	        if in_fade:
	            return Image.blend(black, upcoming, 1.0 - dt / fade_seconds)
	        return black

	    current = bg_list[min(next_idx - 1, last_bg)]
	    if current is upcoming:
	        # Same image on both sides of the boundary: blending it with itself
	        # would only burn CPU and produce identical pixels.
	        return current
	    if in_fade:
	        alpha = max(0.0, min(1.0, 1.0 - dt / fade_seconds))
	        return Image.blend(current, upcoming, alpha)
	    return current


	def _find_active_frame(
	    t: float,
	    frames: list[FrameMeta],
	    linger: float = 0.5,
	) -> FrameMeta | None:
	    """Return the lyric frame to display at time ``t``, or ``None``."""
	    active = None
	    active_start = 0.0
	    for frame in frames:
	        start, end = get_frame_times(frame)
	        # A frame stays on screen `linger` seconds past its last word.
	        if not (start <= t <= end + linger):
	            continue
	        # When a lingering frame overlaps the next one, the frame that
	        # started most recently wins, so the new line (and its entrance
	        # animation) is never hidden behind the previous line's linger.
	        if active is None or start > active_start:
	            active, active_start = frame, start
	    return active


	def _wrap_into_rows(
	    word_layouts: list[tuple],
	    space_w: float,
	    max_width: float,
	) -> tuple[list[list[tuple]], list[float]]:
	    """Greedily pack measured words into rows no wider than ``max_width``."""
	    rows: list[list[tuple]] = [[]]
	    row_widths = [0.0]
	    for layout in word_layouts:
	        word_w = layout[2]
	        row = rows[-1]
	        needed = word_w + (space_w if row else 0)
	        if row and row_widths[-1] + needed > max_width:
	            # Start a new row; its first word needs no leading space.
	            row = []
	            rows.append(row)
	            row_widths.append(0.0)
	            needed = word_w
	        row.append(layout)
	        row_widths[-1] += needed
	    return rows, row_widths


	def _render_frame(
	    t: float,
	    frames: list[FrameMeta],
	    theme_name: str,
	    bg_image: Image.Image | None = None,
	    font_family: str = DEFAULT_FONT_FAMILY,
	) -> bytes:
	    """Render a single video frame at time ``t`` and return its raw bytes.

	    Args:
	        t: Timestamp in seconds.
	        frames: Lyric frames to pick the active line from.
	        theme_name: Key into THEME_COLORS; unknown names fall back to "Dark".
	        bg_image: Optional background. It is copied, never modified. When
	            omitted, a WIDTH x HEIGHT RGB canvas in the theme's "bg" color
	            is used.
	        font_family: Font family passed to ``get_font``.

	    Returns:
	        The rendered image's pixel data as produced by ``Image.tobytes()``.
	    """
	    theme = THEME_COLORS.get(theme_name, THEME_COLORS["Dark"])

	    if bg_image is not None:
	        img = bg_image.copy()
	    else:
	        img = Image.new("RGB", (WIDTH, HEIGHT), theme.get("bg", (10, 10, 10)))
	    draw = ImageDraw.Draw(img)

	    # No lyric line at this time: emit the bare background.
	    active_frame = _find_active_frame(t, frames)
	    if active_frame is None:
	        return img.tobytes()

	    f_start, f_end = get_frame_times(active_frame)
	    f_duration = f_end - f_start
	    t_frame = t - f_start

	    # Frame-level animation yields an offset, a zoom factor and an opacity.
	    frame_anim_func = FRAME_ANIMATIONS.get(
	        active_frame.frame_animation, FRAME_ANIMATIONS["none"]
	    )
	    fx_off, fy_off, f_scale, f_opacity = frame_anim_func(t_frame, f_duration)
	    if f_opacity <= 0:
	        return img.tobytes()

	    # Measure every word unscaled. One font is used for the whole render, so
	    # the space width is shared by all words.
	    font = get_font(font_family)
	    space_bbox = draw.textbbox((0, 0), " ", font=font)
	    space_w = space_bbox[2] - space_bbox[0]

	    word_layouts = []
	    max_h = 0
	    for w in active_frame.words:
	        bbox = draw.textbbox((0, 0), w.text, font=font)
	        ww, wh = bbox[2] - bbox[0], bbox[3] - bbox[1]
	        word_layouts.append((w, bbox, ww, wh))
	        max_h = max(max_h, wh)

	    # Wrap long lines across several rows instead of overflowing the canvas
	    # or shrinking the text to an unreadable size.
	    max_text_width = WIDTH * 0.92
	    rows, row_widths = _wrap_into_rows(word_layouts, space_w, max_text_width)

	    # Cap the zoom so the widest row still fits. This triggers when a single
	    # word is wider than the screen or when the animation zooms in too far.
	    max_row_w = max(row_widths)
	    if max_row_w > 0 and max_row_w * f_scale > max_text_width:
	        f_scale = max_text_width / max_row_w

	    # Stack rows vertically, centered as a block.
	    row_gap = max_h * 0.3
	    row_step = (max_h + row_gap) * f_scale
	    total_block_h = max_h * f_scale + row_step * (len(rows) - 1)
	    y_block_start = HEIGHT / 2 - total_block_h / 2 + fy_off

	    # Pillow cannot scale or fade text directly, so whenever the frame is
	    # zoomed or translucent each word is drawn into its own RGBA buffer that
	    # is resized/faded and then pasted. Otherwise text is drawn in place.
	    use_buffer = abs(f_scale - 1.0) > 0.01 or f_opacity < 0.99

	    for row_idx, (row, row_w) in enumerate(zip(rows, row_widths)):
	        x_cursor = (WIDTH - row_w * f_scale) / 2 + fx_off
	        y_top = y_block_start + row_idx * row_step

	        for w, bbox, ww, wh in row:
	            # Words not yet spoken are dimmed; spoken/current words use the
	            # theme's main text color.
	            if t < w.start:
	                base_color = theme.get("inactive", (80, 80, 80))
	            else:
	                base_color = theme["theme_default"]

	            if use_buffer:
	                pad = int(wh * 0.5)
	                buf_w, buf_h = int(ww + pad * 2), int(wh + pad * 2)
	                word_img = Image.new("RGBA", (buf_w, buf_h), (0, 0, 0, 0))
	                word_draw = ImageDraw.Draw(word_img)
	                # Offset by the glyph's own bbox origin so the ink lands in
	                # [pad, pad+ww] x [pad, pad+wh], fully inside the buffer
	                # regardless of ascenders/descenders.
	                word_draw.text(
	                    (pad - bbox[0], pad - bbox[1]),
	                    w.text,
	                    font=font,
	                    fill=(base_color[0], base_color[1], base_color[2], 255),
	                )

	                if f_scale != 1.0:
	                    new_w, new_h = int(buf_w * f_scale), int(buf_h * f_scale)
	                    if new_w > 0 and new_h > 0:
	                        word_img = word_img.resize(
	                            (new_w, new_h), Image.Resampling.LANCZOS
	                        )

	                if f_opacity < 1.0:
	                    alpha = word_img.split()[3]
	                    alpha = alpha.point(lambda p: p * f_opacity)
	                    word_img.putalpha(alpha)

	                # Undo the padding so the ink lands where direct drawing
	                # would have put it.
	                paste_x = int(x_cursor + (bbox[0] - pad) * f_scale)
	                paste_y = int(y_top + (bbox[1] - pad) * f_scale)
	                img.paste(word_img, (paste_x, paste_y), word_img)
	            else:
	                draw.text(
	                    (int(x_cursor), int(y_top)),
	                    w.text,
	                    font=font,
	                    fill=base_color,
	                )

	            x_cursor += (ww + space_w) * f_scale

	    # Frame-level flash: a white wash that decays over the first 0.25s.
	    if active_frame.frame_animation == "flash":
	        flash_dur = 0.25
	        if 0.0 <= t_frame < flash_dur:
	            flash_alpha = (1.0 - t_frame / flash_dur) * 0.8
	            white = Image.new("RGB", img.size, (255, 255, 255))
	            img = Image.blend(img, white, flash_alpha)

	    return img.tobytes()


	def render_frames(
	    frames: list[FrameMeta],
	    duration: float,
	    theme_name: str = "Dark",
	    bg_images: list | None = None,
	    font_family: str = DEFAULT_FONT_FAMILY,
	) -> Generator[bytes, None, None]:
	    """Yield the raw bytes of every video frame, in order.

	    Args:
	        frames: Lyric frames to render.
	        duration: Total audio duration in seconds; ``int(duration *
	            TARGET_FPS)`` frames are produced.
	        theme_name: Color theme name forwarded to the frame renderer.
	        bg_images: ``None`` or an empty sequence for a plain themed
	            background, a list/tuple of images to rotate through, or a
	            single image used for every lyric frame.
	        font_family: Font family forwarded to the frame renderer.

	    Yields:
	        One ``bytes`` object per video frame.
	    """
	    bg_list = None
	    if isinstance(bg_images, (list, tuple)):
	        # An empty sequence means "no backgrounds", exactly like None.
	        if bg_images:
	            # Each background image covers two consecutive lyric frames, so
	            # the backdrop changes about half as often as the on-screen
	            # text. Images that could never be shown are not preprocessed.
	            needed = (len(frames) + 1) // 2
	            processed = [preprocess_bg(img) for img in bg_images[:needed]]
	            expanded = [img for img in processed for _ in range(2)]
	            shortfall = len(frames) - len(expanded)
	            if shortfall > 0:
	                # Too few images: keep showing the last one.
	                expanded.extend([expanded[-1]] * shortfall)
	            bg_list = expanded[:len(frames)]
	    elif bg_images is not None:
	        # A single image is shared by every lyric frame.
	        bg_list = [preprocess_bg(bg_images)] * len(frames)

	    total_frames = int(duration * TARGET_FPS)
	    for frame_idx in range(total_frames):
	        t = frame_idx / TARGET_FPS

	        current_bg = None
	        if bg_list is not None:
	            current_bg = _get_bg_at_time(t, frames, bg_list)

	        yield _render_frame(
	            t,
	            frames,
	            theme_name,
	            bg_image=current_bg,
	            font_family=font_family,
	        )