tricket / app.py
lerp666
Add LTX-Video image-to-video motion mode (toggle, keeps Ken Burns default)
ae19991
Raw
History Blame Contribute Delete
13.2 kB
"""
tricket — AI product-ad video studio (Gradio frontend, runs on a HF Space).
Tab 1 🎬 Ad Studio : one prompt -> ad script -> per-scene image + voiceover ->
Ken Burns slideshow with burned captions -> 9:16 MP4.
Tab 2 🖼 Single Image : plain FLUX.1-schnell text-to-image.
Heavy lifting runs on Modal (FLUX / Kokoro / ffmpeg). The script is written by
a GLM model (default zai-org/GLM-4.6, overridable with the SCRIPT_MODEL env
var) through the Hugging Face inference router.
Required Space secrets (Settings -> Variables and secrets):
MODAL_TOKEN_ID
MODAL_TOKEN_SECRET
HF_TOKEN (for the GLM script-writer via router.huggingface.co)
"""
import io
import json
import os
import re
import tempfile
import gradio as gr
import modal
from PIL import Image
# Modal app name used for every `from_name` lookup in this file.
MODAL_APP = "tricket-flux"

# Default script-writer model (served via the HF router); the SCRIPT_MODEL
# environment variable overrides it at call time.
SCRIPT_MODEL = "zai-org/GLM-4.6"

# Portrait generation size for FLUX. Both sides are multiples of 16; the
# ratio is 4:7, i.e. close to the 9:16 output format.
GEN_W = 768
GEN_H = 1344

# Kokoro voices grouped by UI language. For each language:
#   "lang_code" -> language code forwarded to the TTS call
#   "voices"    -> {label shown in the dropdown: Kokoro voice id}
LANGS = {
    "中文": {
        "lang_code": "z",
        "voices": {
            "晓晓 · 女声": "zf_xiaoxiao",
            "小贝 · 女声": "zf_xiaobei",
            "云健 · 男声": "zm_yunjian",
            "云希 · 男声": "zm_yunxi",
        },
    },
    "English": {
        "lang_code": "a",
        "voices": {
            "Heart · F": "af_heart",
            "Bella · F": "af_bella",
            "Michael · M": "am_michael",
            "Puck · M": "am_puck",
        },
    },
}
# ---------------------------------------------------------------------------
# Modal handles (resolved lazily so import never crashes the Space).
# ---------------------------------------------------------------------------
def _modal_handles():
    """Look up the Modal handles used by the Ad Studio pipeline.

    Called from inside request handlers rather than at import time, so a
    lookup problem cannot break module import.

    Returns:
        A 4-tuple ``(model, tts, animate, assemble)``: the results of calling
        the ``Model``, ``TTS`` and ``Animate`` class handles, followed by the
        ``assemble_video`` function handle.
    """

    def _instance(cls_name):
        # Resolve the named class in the app, then call it to get an instance.
        return modal.Cls.from_name(MODAL_APP, cls_name)()

    instances = tuple(_instance(name) for name in ("Model", "TTS", "Animate"))
    assemble_fn = modal.Function.from_name(MODAL_APP, "assemble_video")
    return (*instances, assemble_fn)
def _need_modal():
    """Abort with a user-facing error unless both Modal secrets are set.

    Raises:
        gr.Error: if MODAL_TOKEN_ID or MODAL_TOKEN_SECRET is unset or empty.
    """
    token_id = os.environ.get("MODAL_TOKEN_ID")
    token_secret = os.environ.get("MODAL_TOKEN_SECRET")
    if token_id and token_secret:
        return
    raise gr.Error(
        "Modal credentials missing — add MODAL_TOKEN_ID and MODAL_TOKEN_SECRET "
        "to this Space's secrets."
    )
def _need_hf():
    """Abort with a user-facing error unless the HF_TOKEN secret is set.

    Raises:
        gr.Error: if HF_TOKEN is unset or empty.
    """
    if os.environ.get("HF_TOKEN"):
        return
    raise gr.Error(
        "HF_TOKEN missing — add it to this Space's secrets (used by the "
        "GLM script writer)."
    )
# ---------------------------------------------------------------------------
# Script generation (GLM via HF router)
# ---------------------------------------------------------------------------
def write_script(product, selling_points, language, num_scenes):
    """Ask the GLM script writer for a multi-scene ad script.

    Sends one chat-completion request through the Hugging Face router using
    the OpenAI-compatible client and parses the JSON reply.

    Args:
        product: Free-text product description.
        selling_points: Optional selling points; when empty the model is told
            to infer them.
        language: UI language key; "中文" selects Simplified Chinese, anything
            else selects English.
        num_scenes: Number of scenes requested; the result is truncated to at
            most this many.

    Returns:
        ``(title, scenes)`` where ``scenes`` is a non-empty list of dicts as
        returned by the model (expected keys: narration, caption,
        image_prompt). ``title`` falls back to ``product`` when the model
        gives none.

    Raises:
        KeyError: if HF_TOKEN is not set in the environment.
        gr.Error: if the reply cannot be parsed or contains no usable scenes.
    """
    from openai import OpenAI

    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )
    lang_name = "Simplified Chinese" if language == "中文" else "English"
    system_prompt = (
        "You are an award-winning short-form video ad creative. "
        "You write punchy vertical (9:16) product ads."
    )
    user_prompt = f"""Create a {num_scenes}-scene vertical product ad.
Product: {product}
Key selling points: {selling_points or "(infer sensible ones)"}
Rules:
- Narration language: {lang_name}. Each scene narration is ONE short spoken sentence (max ~16 words), energetic.
- caption: a VERY short on-screen text overlay in {lang_name} (max ~6 words).
- image_prompt: a vivid ENGLISH text-to-image prompt for that scene, cinematic, vertical composition, no text in image.
- Scene 1 = hook / hero shot. Last scene = call to action.
- Return STRICT JSON only, no markdown, schema:
{{"title": str, "scenes": [{{"narration": str, "caption": str, "image_prompt": str}}]}}"""
    resp = client.chat.completions.create(
        model=os.environ.get("SCRIPT_MODEL", SCRIPT_MODEL),
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.8,
    )
    raw = resp.choices[0].message.content or ""
    data = _parse_json(raw)

    # The model is free to return any JSON value: tolerate a non-object root,
    # a null/non-list "scenes", and non-object scene entries instead of
    # crashing with AttributeError/TypeError further down the pipeline.
    scenes = data.get("scenes") if isinstance(data, dict) else None
    if not isinstance(scenes, list):
        scenes = []
    scenes = [scene for scene in scenes if isinstance(scene, dict)][:num_scenes]
    if not scenes:
        raise gr.Error("Script generation returned no scenes; try again.")

    # A missing, null or empty title falls back to the product description.
    title = (data.get("title") if isinstance(data, dict) else None) or product
    return title, scenes
def _parse_json(raw):
    """Parse the script writer's reply into a Python object.

    Strips a leading/trailing Markdown code fence, then tries to decode the
    whole text as JSON. If that fails, it retries on the span from the first
    ``{`` to the last ``}`` so that replies wrapped in chatter still parse.

    Args:
        raw: Model reply text.

    Returns:
        The decoded JSON value.

    Raises:
        gr.Error: if neither attempt yields valid JSON.
    """
    text = raw.strip()
    text = re.sub(r"^```(?:json)?", "", text).strip()
    text = re.sub(r"```$", "", text).strip()

    candidates = [text]
    braces = re.search(r"\{.*\}", text, re.DOTALL)
    if braces:
        candidates.append(braces.group(0))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; fall through to the
            # next candidate so a bad fallback span cannot leak a raw
            # decoding error to the UI.
            continue
    raise gr.Error("Could not parse script JSON from the model.")
# ---------------------------------------------------------------------------
# Ad Studio orchestration (streaming generator)
# ---------------------------------------------------------------------------
def make_ad(product, selling_points, language, voice_label, num_scenes, product_image, motion):
    """Run the Ad Studio pipeline, streaming progress to the UI.

    This is a generator. Every yield is a 3-tuple
    ``(status_markdown, gallery_images, video_path)``; ``video_path`` is
    ``None`` until the final yield, which carries the path of the MP4.

    Steps: write the script, fan out image and TTS jobs on Modal, optionally
    animate each still with LTX-Video, then assemble the final video.

    Args:
        product: Product description (required, non-blank).
        selling_points: Optional selling points forwarded to the script writer.
        language: Key into ``LANGS``.
        voice_label: Dropdown label of the voice; an unknown label falls back
            to the first voice of the language.
        num_scenes: Requested scene count (converted with ``int``).
        product_image: Optional product photo; when it converts to PNG it
            replaces the first and last scene images.
        motion: Motion-mode label; any label containing "LTX" enables the
            LTX-Video animation step.

    Raises:
        gr.Error: on missing credentials, blank product, or script failures.
    """
    _need_modal()
    _need_hf()
    if not product or not product.strip():
        raise gr.Error("Please describe your product.")
    num_scenes = int(num_scenes)
    lang_code = LANGS[language]["lang_code"]
    voice = LANGS[language]["voices"].get(voice_label) or next(
        iter(LANGS[language]["voices"].values())
    )
    use_video = "LTX" in (motion or "")
    # LTX needs /32; FLUX needs /16 — 704x1216 satisfies both and is LTX-native.
    gen_w, gen_h = (704, 1216) if use_video else (GEN_W, GEN_H)
    log = []

    def status(msg):
        # Accumulate messages so the status panel shows the full history.
        log.append(msg)
        return "\n\n".join(log)

    yield status("📝 Writing ad script with GLM…"), [], None
    title, scenes = write_script(product, selling_points, language, num_scenes)
    script_preview = "\n".join(
        f"**{i+1}. {s.get('caption','')}** — {s.get('narration','')}"
        for i, s in enumerate(scenes)
    )
    yield status(f"🎬 **{title}**\n\n{script_preview}\n\n🖼 Generating scenes on GPU…"), [], None
    model, tts, animate, assemble = _modal_handles()

    # Optional: use the uploaded product photo as hero (scene 1) and CTA (last).
    hero_bytes = _image_to_png_bytes(product_image) if product_image else None
    last_index = len(scenes) - 1
    hero_slots = {0, last_index} if hero_bytes else set()

    # A scene without an image prompt falls back to the product description
    # rather than aborting the whole run with a KeyError.
    prompts = [s.get("image_prompt") or product for s in scenes]

    # Fan out image + TTS jobs in parallel across Modal containers. Scenes
    # that will show the hero photo get no image job: its result would be
    # discarded anyway.
    img_calls = [
        None
        if i in hero_slots
        else model.generate.spawn(prompt=prompt, width=gen_w, height=gen_h)
        for i, prompt in enumerate(prompts)
    ]
    tts_calls = [
        tts.synth.spawn(text=s.get("narration", ""), voice=voice, lang_code=lang_code)
        for s in scenes
    ]

    images, gallery = [], []
    for i, call in enumerate(img_calls):
        png = hero_bytes if call is None else call.get()
        images.append(png)
        gallery.append(Image.open(io.BytesIO(png)))
        yield status(f"🖼 Scene {i+1}/{len(scenes)} ready…"), list(gallery), None

    # Optional: animate each still into a motion clip with LTX-Video.
    videos = [None] * len(scenes)
    if use_video:
        yield status("🎥 Animating scenes with LTX-Video (this is the slow part)…"), list(gallery), None
        anim_calls = [
            animate.animate.spawn(
                image_png=png,
                prompt=prompt,
                width=gen_w,
                height=gen_h,
            )
            for png, prompt in zip(images, prompts)
        ]
        for i, call in enumerate(anim_calls):
            videos[i] = call.get()
            yield status(f"🎥 Motion clip {i+1}/{len(scenes)} ready…"), list(gallery), None

    yield status("🔊 Synthesizing voiceover…"), list(gallery), None
    audios = [call.get() for call in tts_calls]
    mode_label = "LTX motion" if use_video else "Ken Burns"
    yield status(f"🎞 Assembling video ({mode_label} + captions + voiceover)…"), list(gallery), None
    scene_payload = [
        {
            "image": png,
            "video": clip,
            "audio": audio,
            "caption": scene.get("caption", ""),
        }
        for scene, png, clip, audio in zip(scenes, images, videos, audios)
    ]
    mp4 = assemble.remote(scene_payload)

    # delete=False keeps the file on disk after the handle closes so the
    # video component can read it by path; the context manager guarantees
    # the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
        out.write(mp4)
    yield status(f"✅ Done! **{title}** — {len(scenes)} scenes ({mode_label})."), list(gallery), out.name
def _image_to_png_bytes(path_or_img):
    """Best-effort conversion of an uploaded image to RGB PNG bytes.

    Args:
        path_or_img: A file path (``str``), or any other object, which is
            handed to ``Image.fromarray``.

    Returns:
        The PNG-encoded bytes, or ``None`` if anything goes wrong while
        opening, converting or encoding the image.
    """
    try:
        if isinstance(path_or_img, str):
            picture = Image.open(path_or_img)
        else:
            picture = Image.fromarray(path_or_img)
        rgb = picture.convert("RGB")
        sink = io.BytesIO()
        rgb.save(sink, format="PNG")
        return sink.getvalue()
    except Exception:
        # Deliberately forgiving: the product photo is optional, so a bad
        # upload just means "no hero image".
        return None
# ---------------------------------------------------------------------------
# Single-image tab
# ---------------------------------------------------------------------------
def single_image(prompt, steps, width, height, seed):
    """Generate one image for the Single Image tab.

    Args:
        prompt: Text prompt (required, non-blank).
        steps: Inference step count (converted with ``int``).
        width: Image width (converted with ``int``).
        height: Image height (converted with ``int``).
        seed: Seed value (converted with ``int``); the UI labels -1 as random.

    Returns:
        The generated image as a PIL image decoded from the returned bytes.

    Raises:
        gr.Error: on missing Modal credentials, blank prompt, or any failure
            while converting arguments or running the remote call.
    """
    _need_modal()
    if not (prompt and prompt.strip()):
        raise gr.Error("Please enter a prompt.")

    flux = modal.Cls.from_name(MODAL_APP, "Model")()
    try:
        # Argument conversion stays inside the try so a bad value is reported
        # through the same "Generation failed" path as a remote error.
        request = {
            "prompt": prompt,
            "num_inference_steps": int(steps),
            "width": int(width),
            "height": int(height),
            "seed": int(seed),
        }
        png_bytes = flux.generate.remote(**request)
    except Exception as exc:  # noqa: BLE001
        raise gr.Error(f"Generation failed: {exc}")
    return Image.open(io.BytesIO(png_bytes))
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
def _on_lang_change(language):
    """Refresh the voice dropdown after the narration language changes.

    Args:
        language: Key into ``LANGS``.

    Returns:
        A ``gr.update`` that sets the dropdown choices to that language's
        voice labels and selects the first one.
    """
    labels = list(LANGS[language]["voices"])
    first_label = labels[0]
    return gr.update(choices=labels, value=first_label)
# Build the two-tab Gradio UI. Components are created in the same order as
# before; only the layout of the source is restated.
with gr.Blocks(title="tricket", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
# 🎬 tricket — one prompt → a product ad video
Describe a product → GLM writes the script → **FLUX** paints each scene →
**Kokoro** voices it → **ffmpeg** cuts a vertical (9:16) ad.
GPU runs on-demand on **Modal**. First run after idle ~30–60s (cold start).
"""
    )

    # ---- Tab 1: full ad pipeline -------------------------------------------
    with gr.Tab("🎬 Ad Studio"):
        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=3):
                product = gr.Textbox(
                    label="Product",
                    placeholder="Aura Buds — wireless noise-cancelling earbuds with 30h battery",
                    lines=2,
                )
                selling = gr.Textbox(
                    label="Key selling points (optional)",
                    placeholder="noise cancelling, 30h battery, sweat-proof, instant pairing",
                    lines=2,
                )
                with gr.Row():
                    language = gr.Dropdown(
                        list(LANGS), value="中文", label="Narration language"
                    )
                    # Starts on the Chinese voices to match the default language.
                    voice = gr.Dropdown(
                        list(LANGS["中文"]["voices"]),
                        value=next(iter(LANGS["中文"]["voices"])),
                        label="Voice",
                    )
                    scenes_n = gr.Slider(3, 6, value=4, step=1, label="Scenes")
                product_img = gr.Image(
                    label="Product photo (optional — used as hero & end frame)",
                    type="filepath",
                    height=160,
                )
                # make_ad switches to LTX-Video when the chosen label contains "LTX".
                motion = gr.Radio(
                    ["Ken Burns(快)", "AI 视频 · LTX(慢,更炫)"],
                    value="Ken Burns(快)",
                    label="Motion / 运镜",
                    info="AI 视频会为每个场景生成真实动态片段,明显更慢、更费 GPU。",
                )
                make_btn = gr.Button("🎬 Make Ad Video", variant="primary")
            # Right column: streamed outputs.
            with gr.Column(scale=4):
                status = gr.Markdown("Ready.")
                gallery = gr.Gallery(label="Scenes", columns=3, height=240)
                video = gr.Video(label="Ad video (9:16)")

        # Keep the voice list in sync with the selected language.
        language.change(_on_lang_change, inputs=language, outputs=voice)
        make_btn.click(
            make_ad,
            inputs=[product, selling, language, voice, scenes_n, product_img, motion],
            outputs=[status, gallery, video],
        )

    # ---- Tab 2: plain text-to-image ------------------------------------------
    with gr.Tab("🖼 Single Image"):
        with gr.Row():
            with gr.Column(scale=3):
                s_prompt = gr.Textbox(
                    label="Prompt",
                    lines=3,
                    placeholder="a cinematic photo of a red panda barista",
                )
                with gr.Row():
                    s_steps = gr.Slider(1, 8, value=4, step=1, label="Steps")
                    s_seed = gr.Number(value=-1, label="Seed (-1=random)", precision=0)
                with gr.Row():
                    s_w = gr.Slider(512, 1024, value=1024, step=64, label="Width")
                    s_h = gr.Slider(512, 1024, value=1024, step=64, label="Height")
                s_btn = gr.Button("Generate", variant="primary")
            with gr.Column(scale=4):
                s_out = gr.Image(label="Result", type="pil", height=512)

        s_btn.click(
            single_image,
            inputs=[s_prompt, s_steps, s_w, s_h, s_seed],
            outputs=s_out,
        )

# Launch only when run as a script; importing the module just builds `demo`.
if __name__ == "__main__":
    demo.launch()