Spaces:

build-small-hackathon
/

tricket

Running

lerp666

Add LTX-Video image-to-video motion mode (toggle, keeps Ken Burns default)

ae19991 22 days ago

13.2 kB

	"""
	tricket — AI product-ad video studio (Gradio frontend, runs on a HF Space).

	Tab 1 🎬 Ad Studio : one prompt -> ad script -> per-scene image + voiceover ->
	Ken Burns slideshow (or optional LTX-Video motion clips) with burned captions ->
	9:16 MP4.
	Tab 2 🖼 Single Image : plain FLUX.1-schnell text-to-image.

	Heavy lifting runs on Modal (FLUX / Kokoro / LTX-Video / ffmpeg). The script is
	written by a GLM model (default zai-org/GLM-4.6, overridable via the SCRIPT_MODEL
	environment variable) through the Hugging Face inference router.

	Required Space secrets (Settings -> Variables and secrets):
	MODAL_TOKEN_ID
	MODAL_TOKEN_SECRET
	HF_TOKEN (for the GLM script-writer via router.huggingface.co)
	"""

	import io
	import json
	import os
	import re
	import tempfile

	import gradio as gr
	import modal
	from PIL import Image

	# Name of the deployed Modal app; every remote class/function in this file is
	# looked up by name inside this app.
	MODAL_APP = "tricket-flux"
	# Default chat model for the script writer. write_script() reads the
	# SCRIPT_MODEL environment variable first and falls back to this value.
	SCRIPT_MODEL = "zai-org/GLM-4.6" # via HF router; overridable with SCRIPT_MODEL env

	# 9:16 generation size for FLUX (multiples of 16).
	# Used by make_ad() for the Ken Burns mode; the LTX mode uses its own size.
	GEN_W, GEN_H = 768, 1344

	# Kokoro voices grouped by language.
	# - Outer keys are the labels shown in the "Narration language" dropdown.
	# - "lang_code" and the voice ids (inner dict values) are forwarded to the
	#   remote TTS job by make_ad().
	# - Inner "voices" keys are the labels shown in the "Voice" dropdown.
	# Insertion order matters: the first voice of each language is the default
	# (see make_ad() and _on_lang_change()).
	LANGS = {
	"中文": {
	"lang_code": "z",
	"voices": {
	"晓晓 · 女声": "zf_xiaoxiao",
	"小贝 · 女声": "zf_xiaobei",
	"云健 · 男声": "zm_yunjian",
	"云希 · 男声": "zm_yunxi",
	},
	},
	"English": {
	"lang_code": "a",
	"voices": {
	"Heart · F": "af_heart",
	"Bella · F": "af_bella",
	"Michael · M": "am_michael",
	"Puck · M": "am_puck",
	},
	},
	}

	# ---------------------------------------------------------------------------
	# Modal handles (resolved lazily so import never crashes the Space).
	# ---------------------------------------------------------------------------
	def _modal_handles():
	    """Look up the remote Modal objects used by the Ad Studio pipeline.

	    Returns:
	        A 4-tuple ``(model, tts, animate, assemble)``: instances of the
	        ``Model``, ``TTS`` and ``Animate`` classes followed by the
	        ``assemble_video`` function, all resolved by name from ``MODAL_APP``.
	    """
	    # Same lookup order as the return order: the three classes, then the function.
	    instances = [
	        modal.Cls.from_name(MODAL_APP, cls_name)()
	        for cls_name in ("Model", "TTS", "Animate")
	    ]
	    assemble_fn = modal.Function.from_name(MODAL_APP, "assemble_video")
	    return (*instances, assemble_fn)


	def _need_modal():
	    """Abort with a UI error unless both Modal credential variables are set.

	    Raises:
	        gr.Error: if ``MODAL_TOKEN_ID`` or ``MODAL_TOKEN_SECRET`` is unset or
	            empty in the environment.
	    """
	    token_id = os.environ.get("MODAL_TOKEN_ID")
	    token_secret = os.environ.get("MODAL_TOKEN_SECRET")
	    if token_id and token_secret:
	        return
	    raise gr.Error(
	        "Modal credentials missing — add MODAL_TOKEN_ID and MODAL_TOKEN_SECRET "
	        "to this Space's secrets."
	    )


	def _need_hf():
	    """Abort with a UI error unless the Hugging Face token is configured.

	    Raises:
	        gr.Error: if ``HF_TOKEN`` is unset or empty in the environment.
	    """
	    hf_token = os.environ.get("HF_TOKEN")
	    if hf_token:
	        return
	    raise gr.Error(
	        "HF_TOKEN missing — add it to this Space's secrets (used by the "
	        "GLM script writer)."
	    )


	# ---------------------------------------------------------------------------
	# Script generation (GLM via HF router)
	# ---------------------------------------------------------------------------
	def write_script(product, selling_points, language, num_scenes):
	    """Ask the chat model for an ad script and return ``(title, scenes)``.

	    Args:
	        product: Free-text product description inserted into the prompt.
	        selling_points: Optional selling points; when falsy the model is told
	            to infer sensible ones.
	        language: ``"中文"`` selects Simplified Chinese narration/captions;
	            any other value selects English.
	        num_scenes: Number of scenes requested; extra scenes returned by the
	            model are dropped.

	    Returns:
	        A tuple ``(title, scenes)``. ``title`` falls back to ``product`` when
	        the model omits it or returns an empty value. ``scenes`` is a non-empty
	        list of dicts as produced by the model (expected keys: ``narration``,
	        ``caption``, ``image_prompt``).

	    Raises:
	        KeyError: if ``HF_TOKEN`` is not set in the environment.
	        gr.Error: if the request fails, the reply is not parseable JSON, or it
	            contains no usable scenes.
	    """
	    # Imported lazily so that importing this module does not require openai.
	    from openai import OpenAI

	    client = OpenAI(
	        base_url="https://router.huggingface.co/v1",
	        api_key=os.environ["HF_TOKEN"],
	    )
	    lang_name = "Simplified Chinese" if language == "中文" else "English"
	    system_prompt = (
	        "You are an award-winning short-form video ad creative. "
	        "You write punchy vertical (9:16) product ads."
	    )
	    user = f"""Create a {num_scenes}-scene vertical product ad.

	Product: {product}
	Key selling points: {selling_points or "(infer sensible ones)"}

	Rules:
	- Narration language: {lang_name}. Each scene narration is ONE short spoken sentence (max ~16 words), energetic.
	- caption: a VERY short on-screen text overlay in {lang_name} (max ~6 words).
	- image_prompt: a vivid ENGLISH text-to-image prompt for that scene, cinematic, vertical composition, no text in image.
	- Scene 1 = hook / hero shot. Last scene = call to action.
	- Return STRICT JSON only, no markdown, schema:
	{{"title": str, "scenes": [{{"narration": str, "caption": str, "image_prompt": str}}]}}"""

	    try:
	        resp = client.chat.completions.create(
	            model=os.environ.get("SCRIPT_MODEL", SCRIPT_MODEL),
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user},
	            ],
	            temperature=0.8,
	        )
	    except Exception as exc:  # noqa: BLE001 - UI boundary: surface a readable error
	        raise gr.Error(f"Script generation failed: {exc}") from exc

	    # An empty choices list or a null content both degrade to "", which
	    # _parse_json turns into a readable gr.Error.
	    raw = (resp.choices[0].message.content if resp.choices else "") or ""
	    data = _parse_json(raw)

	    # The model may return valid JSON of the wrong shape (e.g. a bare list or
	    # "scenes": null); treat anything unusable as "no scenes".
	    raw_scenes = data.get("scenes") if isinstance(data, dict) else None
	    if not isinstance(raw_scenes, list):
	        raw_scenes = []
	    scenes = [scene for scene in raw_scenes if isinstance(scene, dict)][:num_scenes]
	    if not scenes:
	        raise gr.Error("Script generation returned no scenes; try again.")
	    return data.get("title") or product, scenes


	def _parse_json(raw):
	    """Extract the JSON value from a model reply.

	    Handles three shapes of reply: plain JSON, JSON wrapped in a Markdown
	    code fence (three backticks, optionally tagged ``json``), and JSON
	    embedded in surrounding prose.

	    Args:
	        raw: The model's reply text.

	    Returns:
	        The decoded JSON value (normally a dict).

	    Raises:
	        gr.Error: if no decodable JSON object can be found in ``raw``.
	    """
	    text = raw.strip()
	    # Strip a leading/trailing Markdown code fence if present.
	    text = re.sub(r"^```(?:json)?", "", text).strip()
	    text = re.sub(r"```$", "", text).strip()
	    try:
	        return json.loads(text)
	    except json.JSONDecodeError:
	        pass

	    # Fallback: decode the first well-formed object starting at any "{".
	    # raw_decode stops at the end of the object, so trailing prose (even prose
	    # containing braces) does not break parsing, and every failure stays inside
	    # this function instead of escaping as a raw JSONDecodeError.
	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            value, _end = decoder.raw_decode(text, start)
	        except json.JSONDecodeError:
	            start = text.find("{", start + 1)
	        else:
	            return value
	    raise gr.Error("Could not parse script JSON from the model.")


	# ---------------------------------------------------------------------------
	# Ad Studio orchestration (streaming generator)
	# ---------------------------------------------------------------------------
	def make_ad(product, selling_points, language, voice_label, num_scenes, product_image, motion):
	    """Generate a vertical ad video, streaming progress to the UI.

	    This is a generator used as a Gradio event handler. Every ``yield`` is a
	    ``(status_markdown, gallery_images, video_path)`` tuple; ``video_path``
	    stays ``None`` until the final yield, which carries the path of the MP4.

	    Args:
	        product: Free-text product description (required, non-blank).
	        selling_points: Optional selling points forwarded to ``write_script``.
	        language: Key of ``LANGS`` selecting the narration language.
	        voice_label: Key of ``LANGS[language]["voices"]``; an unknown label
	            falls back to the first voice of that language.
	        num_scenes: Requested scene count (converted with ``int``).
	        product_image: Optional product photo. When it can be decoded it is
	            used as the image of the first and the last scene.
	        motion: Motion-mode label; LTX-Video animation is enabled when the
	            label contains ``"LTX"``, otherwise stills are passed through.

	    Raises:
	        gr.Error: on missing secrets, a blank product, an unknown language, or
	            a failed script generation.
	    """
	    _need_modal()
	    _need_hf()
	    if not product or not product.strip():
	        raise gr.Error("Please describe your product.")
	    lang_cfg = LANGS.get(language)
	    if lang_cfg is None:
	        raise gr.Error(f"Unsupported narration language: {language!r}")

	    num_scenes = int(num_scenes)
	    lang_code = lang_cfg["lang_code"]
	    voices = lang_cfg["voices"]
	    voice = voices.get(voice_label) or next(iter(voices.values()))
	    use_video = "LTX" in (motion or "")
	    # LTX needs /32; FLUX needs /16 — 704x1216 satisfies both and is LTX-native.
	    gen_w, gen_h = (704, 1216) if use_video else (GEN_W, GEN_H)

	    log = []

	    def status(msg):
	        # Accumulate messages so the status panel shows the whole history.
	        log.append(msg)
	        return "\n\n".join(log)

	    yield status("📝 Writing ad script with GLM…"), [], None

	    title, scenes = write_script(product, selling_points, language, num_scenes)
	    script_preview = "\n".join(
	        f"{i+1}. {s.get('caption','')} — {s.get('narration','')}"
	        for i, s in enumerate(scenes)
	    )
	    yield status(f"🎬 {title}\n\n{script_preview}\n\n🖼 Generating scenes on GPU…"), [], None

	    model, tts, animate, assemble = _modal_handles()

	    # The model occasionally omits image_prompt; fall back to the product
	    # description instead of failing with a KeyError.
	    prompts = [s.get("image_prompt") or product for s in scenes]

	    # Optional: use the uploaded product photo as hero (scene 1) and CTA (last).
	    # Decoded before the fan-out so no GPU image job is spawned for a scene
	    # whose picture would be thrown away.
	    hero_bytes = _image_to_png_bytes(product_image) if product_image else None
	    hero_slots = {0, len(scenes) - 1} if hero_bytes else set()

	    # Fan out image + TTS jobs in parallel across Modal containers.
	    img_calls = [
	        None
	        if i in hero_slots
	        else model.generate.spawn(prompt=prompt, width=gen_w, height=gen_h)
	        for i, prompt in enumerate(prompts)
	    ]
	    tts_calls = [
	        tts.synth.spawn(text=s.get("narration", ""), voice=voice, lang_code=lang_code)
	        for s in scenes
	    ]

	    images, gallery = [], []
	    for i, call in enumerate(img_calls):
	        png = hero_bytes if call is None else call.get()
	        images.append(png)
	        gallery.append(Image.open(io.BytesIO(png)))
	        yield status(f"🖼 Scene {i+1}/{len(scenes)} ready…"), list(gallery), None

	    # Optional: animate each still into a motion clip with LTX-Video.
	    videos = [None] * len(scenes)
	    if use_video:
	        yield status("🎥 Animating scenes with LTX-Video (this is the slow part)…"), list(gallery), None
	        anim_calls = [
	            animate.animate.spawn(
	                image_png=png,
	                prompt=prompt,
	                width=gen_w,
	                height=gen_h,
	            )
	            for png, prompt in zip(images, prompts)
	        ]
	        for i, call in enumerate(anim_calls):
	            videos[i] = call.get()
	            yield status(f"🎥 Motion clip {i+1}/{len(scenes)} ready…"), list(gallery), None

	    yield status("🔊 Synthesizing voiceover…"), list(gallery), None
	    audios = [call.get() for call in tts_calls]

	    mode_label = "LTX motion" if use_video else "Ken Burns"
	    yield status(f"🎞 Assembling video ({mode_label} + captions + voiceover)…"), list(gallery), None
	    scene_payload = [
	        {
	            "image": png,
	            "video": clip,
	            "audio": audio,
	            "caption": scene.get("caption", ""),
	        }
	        for png, clip, audio, scene in zip(images, videos, audios, scenes)
	    ]
	    mp4 = assemble.remote(scene_payload)

	    # delete=False: the file must outlive this function so Gradio can serve it.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
	        out.write(mp4)
	    yield status(f"✅ Done! {title} — {len(scenes)} scenes ({mode_label})."), list(gallery), out.name


	def _image_to_png_bytes(path_or_img):
	    """Best-effort conversion of an uploaded image to RGB PNG bytes.

	    Args:
	        path_or_img: A filesystem path (``str`` or ``os.PathLike``), an
	            already-decoded ``PIL.Image.Image``, or an array accepted by
	            ``Image.fromarray``.

	    Returns:
	        The PNG-encoded bytes of the image converted to RGB, or ``None`` when
	        the input cannot be read or encoded. ``make_ad`` treats ``None`` as
	        "no product photo" and keeps the generated scene images.
	    """
	    try:
	        if isinstance(path_or_img, (str, os.PathLike)):
	            # Close the underlying file as soon as the pixels are converted;
	            # convert() returns an independent in-memory image.
	            with Image.open(path_or_img) as opened:
	                rgb = opened.convert("RGB")
	        elif isinstance(path_or_img, Image.Image):
	            rgb = path_or_img.convert("RGB")
	        else:
	            rgb = Image.fromarray(path_or_img).convert("RGB")
	        buf = io.BytesIO()
	        rgb.save(buf, format="PNG")
	        return buf.getvalue()
	    except Exception:  # noqa: BLE001 - deliberate best-effort: a bad photo must not abort the ad
	        return None


	# ---------------------------------------------------------------------------
	# Single-image tab
	# ---------------------------------------------------------------------------
	def single_image(prompt, steps, width, height, seed):
	    """Generate one image from a text prompt on the remote ``Model`` class.

	    Args:
	        prompt: Text prompt (required, non-blank).
	        steps: Number of inference steps (converted with ``int``).
	        width: Image width in pixels (converted with ``int``).
	        height: Image height in pixels (converted with ``int``).
	        seed: Seed value; ``None`` (an emptied number field) is treated as
	            ``-1``, which the UI labels as "random".

	    Returns:
	        The generated image as a ``PIL.Image.Image``.

	    Raises:
	        gr.Error: on missing Modal credentials, a blank prompt, or any failure
	            of the remote generation call.
	    """
	    _need_modal()
	    if not prompt or not prompt.strip():
	        raise gr.Error("Please enter a prompt.")
	    # A cleared number field arrives as None; int(None) would raise TypeError.
	    seed_value = -1 if seed is None else int(seed)
	    try:
	        model = modal.Cls.from_name(MODAL_APP, "Model")()
	        png = model.generate.remote(
	            prompt=prompt,
	            num_inference_steps=int(steps),
	            width=int(width),
	            height=int(height),
	            seed=seed_value,
	        )
	    except Exception as exc:  # noqa: BLE001
	        raise gr.Error(f"Generation failed: {exc}") from exc
	    return Image.open(io.BytesIO(png))


	# ---------------------------------------------------------------------------
	# UI
	# ---------------------------------------------------------------------------
	def _on_lang_change(language):
	    """Return a dropdown update listing the voices of ``language``.

	    The choices are the voice labels of ``LANGS[language]`` in definition
	    order, with the first label pre-selected.
	    """
	    voice_labels = [label for label in LANGS[language]["voices"]]
	    default_label = voice_labels[0]
	    return gr.update(choices=voice_labels, value=default_label)


	# Build the two-tab Gradio UI at import time and bind it to `demo`.
	with gr.Blocks(title="tricket", theme=gr.themes.Soft()) as demo:
	gr.Markdown(
	"""
	# 🎬 tricket — one prompt → a product ad video
	Describe a product → GLM writes the script → FLUX paints each scene →
	Kokoro voices it → ffmpeg cuts a vertical (9:16) ad.
	GPU runs on-demand on Modal. First run after idle ~30–60s (cold start).
	"""
	)

	# Tab 1: ad inputs in the narrower column, streamed results in the wider one.
	with gr.Tab("🎬 Ad Studio"):
	with gr.Row():
	with gr.Column(scale=3):
	product = gr.Textbox(
	label="Product",
	placeholder="Aura Buds — wireless noise-cancelling earbuds with 30h battery",
	lines=2,
	)
	selling = gr.Textbox(
	label="Key selling points (optional)",
	placeholder="noise cancelling, 30h battery, sweat-proof, instant pairing",
	lines=2,
	)
	with gr.Row():
	# Language and voice choices both come from LANGS; the voice list starts
	# with the voices of the default language.
	language = gr.Dropdown(
	list(LANGS.keys()), value="中文", label="Narration language"
	)
	voice = gr.Dropdown(
	list(LANGS["中文"]["voices"].keys()),
	value=list(LANGS["中文"]["voices"].keys())[0],
	label="Voice",
	)
	scenes_n = gr.Slider(3, 6, value=4, step=1, label="Scenes")
	# type="filepath": make_ad receives the upload as a path string.
	product_img = gr.Image(
	label="Product photo (optional — used as hero & end frame)",
	type="filepath",
	height=160,
	)
	# make_ad enables LTX animation when the selected label contains "LTX".
	motion = gr.Radio(
	["Ken Burns（快）", "AI 视频 · LTX（慢，更炫）"],
	value="Ken Burns（快）",
	label="Motion / 运镜",
	info="AI 视频会为每个场景生成真实动态片段,明显更慢、更费 GPU。",
	)
	make_btn = gr.Button("🎬 Make Ad Video", variant="primary")
	with gr.Column(scale=4):
	status = gr.Markdown("Ready.")
	gallery = gr.Gallery(label="Scenes", columns=3, height=240)
	video = gr.Video(label="Ad video (9:16)")

	# Refresh the voice dropdown whenever the narration language changes.
	language.change(_on_lang_change, inputs=language, outputs=voice)
	# make_ad is a generator; each value it yields updates (status, gallery, video)
	# in that order.
	make_btn.click(
	make_ad,
	inputs=[product, selling, language, voice, scenes_n, product_img, motion],
	outputs=[status, gallery, video],
	)

	# Tab 2: plain text-to-image with explicit steps / seed / size controls.
	with gr.Tab("🖼 Single Image"):
	with gr.Row():
	with gr.Column(scale=3):
	s_prompt = gr.Textbox(label="Prompt", lines=3,
	placeholder="a cinematic photo of a red panda barista")
	with gr.Row():
	s_steps = gr.Slider(1, 8, value=4, step=1, label="Steps")
	s_seed = gr.Number(value=-1, label="Seed (-1=random)", precision=0)
	with gr.Row():
	s_w = gr.Slider(512, 1024, value=1024, step=64, label="Width")
	s_h = gr.Slider(512, 1024, value=1024, step=64, label="Height")
	s_btn = gr.Button("Generate", variant="primary")
	with gr.Column(scale=4):
	s_out = gr.Image(label="Result", type="pil", height=512)
	# Input order matches single_image(prompt, steps, width, height, seed).
	s_btn.click(single_image, inputs=[s_prompt, s_steps, s_w, s_h, s_seed], outputs=s_out)


	# Start the Gradio server only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	demo.launch()