"""
tricket — AI product-ad video studio (Gradio frontend, runs on a HF Space).

Tab 1 🎬 Ad Studio    : one prompt -> ad script -> per-scene image + voiceover
                        -> Ken Burns slideshow (or optional LTX-Video motion
                        clips) with burned captions -> 9:16 MP4.
Tab 2 🖼 Single Image : plain FLUX.1-schnell text-to-image.

Heavy lifting runs on Modal (FLUX / Kokoro / ffmpeg). The script is written by
a GLM model (default: ``SCRIPT_MODEL`` below, overridable through the
``SCRIPT_MODEL`` environment variable) through the Hugging Face inference
router.

Required Space secrets (Settings -> Variables and secrets):
    MODAL_TOKEN_ID
    MODAL_TOKEN_SECRET
    HF_TOKEN   (for the GLM script-writer via router.huggingface.co)
"""

import io
import json
import os
import re
import tempfile

import gradio as gr
import modal
from PIL import Image

MODAL_APP = "tricket-flux"
SCRIPT_MODEL = "zai-org/GLM-4.6"  # via HF router; overridable with SCRIPT_MODEL env

# 9:16 generation size for FLUX (multiples of 16).
GEN_W, GEN_H = 768, 1344

# Kokoro voices grouped by language.
LANGS = {
    "中文": {
        "lang_code": "z",
        "voices": {
            "晓晓 · 女声": "zf_xiaoxiao",
            "小贝 · 女声": "zf_xiaobei",
            "云健 · 男声": "zm_yunjian",
            "云希 · 男声": "zm_yunxi",
        },
    },
    "English": {
        "lang_code": "a",
        "voices": {
            "Heart · F": "af_heart",
            "Bella · F": "af_bella",
            "Michael · M": "am_michael",
            "Puck · M": "am_puck",
        },
    },
}

# Intro text shown at the top of the page. Kept at column 0 so the heading
# renders as Markdown regardless of how the component treats indentation.
_INTRO_MD = """
# 🎬 tricket — one prompt → a product ad video
Describe a product → GLM writes the script → **FLUX** paints each scene → **Kokoro** voices it
→ **ffmpeg** cuts a vertical (9:16) ad. GPU runs on-demand on **Modal**.
First run after idle ~30–60s (cold start).
"""


# ---------------------------------------------------------------------------
# Modal handles (resolved lazily so import never crashes the Space).
# ---------------------------------------------------------------------------
def _modal_handles():
    """Look up the Modal classes/functions of ``MODAL_APP`` by name.

    Returns:
        A ``(model, tts, animate, assemble)`` tuple: instances of the
        ``Model``, ``TTS`` and ``Animate`` classes plus the
        ``assemble_video`` function handle.
    """
    model = modal.Cls.from_name(MODAL_APP, "Model")()
    tts = modal.Cls.from_name(MODAL_APP, "TTS")()
    animate = modal.Cls.from_name(MODAL_APP, "Animate")()
    assemble = modal.Function.from_name(MODAL_APP, "assemble_video")
    return model, tts, animate, assemble


def _need_modal():
    """Raise ``gr.Error`` unless both Modal credential env vars are set."""
    if not os.environ.get("MODAL_TOKEN_ID") or not os.environ.get("MODAL_TOKEN_SECRET"):
        raise gr.Error(
            "Modal credentials missing — add MODAL_TOKEN_ID and MODAL_TOKEN_SECRET "
            "to this Space's secrets."
        )


def _need_hf():
    """Raise ``gr.Error`` unless the ``HF_TOKEN`` env var is set."""
    if not os.environ.get("HF_TOKEN"):
        raise gr.Error(
            "HF_TOKEN missing — add it to this Space's secrets (used by the "
            "GLM script writer)."
        )


# ---------------------------------------------------------------------------
# Script generation (GLM via HF router)
# ---------------------------------------------------------------------------
def write_script(product, selling_points, language, num_scenes):
    """Ask the script model for a multi-scene ad script.

    Args:
        product: Free-text product description.
        selling_points: Optional selling points; when empty the model is told
            to infer sensible ones.
        language: A key of ``LANGS``; "中文" selects Simplified Chinese,
            anything else English.
        num_scenes: Number of scenes requested; extra scenes returned by the
            model are dropped.

    Returns:
        ``(title, scenes)`` where ``scenes`` is a list of dicts. Every scene
        is guaranteed to have an ``image_prompt`` key; ``narration`` and
        ``caption`` are requested from the model but not enforced here.

    Raises:
        gr.Error: If the API call fails, the reply cannot be parsed, or the
            scenes are missing or malformed.
    """
    from openai import OpenAI

    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )
    lang_name = "Simplified Chinese" if language == "中文" else "English"
    system_prompt = (
        "You are an award-winning short-form video ad creative. "
        "You write punchy vertical (9:16) product ads."
    )
    user = f"""Create a {num_scenes}-scene vertical product ad.
Product: {product}
Key selling points: {selling_points or "(infer sensible ones)"}
Rules:
- Narration language: {lang_name}. Each scene narration is ONE short spoken sentence (max ~16 words), energetic.
- caption: a VERY short on-screen text overlay in {lang_name} (max ~6 words).
- image_prompt: a vivid ENGLISH text-to-image prompt for that scene, cinematic, vertical composition, no text in image.
- Scene 1 = hook / hero shot. Last scene = call to action.
- Return STRICT JSON only, no markdown, schema:
{{"title": str, "scenes": [{{"narration": str, "caption": str, "image_prompt": str}}]}}"""

    try:
        resp = client.chat.completions.create(
            model=os.environ.get("SCRIPT_MODEL", SCRIPT_MODEL),
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user},
            ],
            temperature=0.8,
        )
    except Exception as exc:  # noqa: BLE001 - surface any client/network failure in the UI
        raise gr.Error(f"Script generation failed: {exc}") from exc

    raw = resp.choices[0].message.content or ""
    data = _parse_json(raw)

    scenes = data.get("scenes")
    scenes = scenes[:num_scenes] if isinstance(scenes, list) else []
    if not scenes:
        raise gr.Error("Script generation returned no scenes; try again.")
    # make_ad indexes scene["image_prompt"] directly, so reject bad shapes here
    # with a readable message instead of a KeyError/AttributeError later.
    if any(not isinstance(s, dict) or "image_prompt" not in s for s in scenes):
        raise gr.Error("Script generation returned malformed scenes; try again.")
    return data.get("title") or product, scenes


def _parse_json(raw):
    """Parse the model reply into a dict, tolerating Markdown code fences.

    Strips a leading/trailing ``` fence, tries ``json.loads`` on the rest and,
    if that fails, retries on the outermost ``{...}`` span found in the text.

    Raises:
        gr.Error: If no JSON object can be recovered from ``raw``.
    """
    raw = raw.strip()
    raw = re.sub(r"^```(?:json)?", "", raw).strip()
    raw = re.sub(r"```$", "", raw).strip()
    try:
        data = json.loads(raw)
    except ValueError:
        m = re.search(r"\{.*\}", raw, re.DOTALL)
        if not m:
            raise gr.Error("Could not parse script JSON from the model.") from None
        try:
            data = json.loads(m.group(0))
        except ValueError as exc:
            raise gr.Error("Could not parse script JSON from the model.") from exc
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a bare list); callers need .get().
        raise gr.Error("Could not parse script JSON from the model.")
    return data


# ---------------------------------------------------------------------------
# Ad Studio orchestration (streaming generator)
# ---------------------------------------------------------------------------
def make_ad(product, selling_points, language, voice_label, num_scenes, product_image, motion):
    """Run the whole ad pipeline, streaming progress to the UI.

    This is a generator: every ``yield`` is a ``(status_markdown,
    gallery_images, video_path_or_None)`` tuple matching the three Gradio
    outputs it is wired to. Only the final yield carries a video path.

    Args:
        product: Product description (required, non-blank).
        selling_points: Optional selling points forwarded to the script writer.
        language: A key of ``LANGS``.
        voice_label: A voice label of that language; unknown labels fall back
            to the language's first voice.
        num_scenes: Scene count (coerced with ``int``).
        product_image: Optional product photo; when it can be read it is used
            as the first and last scene image.
        motion: Motion mode label; any label containing "LTX" enables the
            LTX-Video animation step.

    Raises:
        gr.Error: On missing credentials, blank product, or script failures.
    """
    _need_modal()
    _need_hf()
    if not product or not product.strip():
        raise gr.Error("Please describe your product.")

    num_scenes = int(num_scenes)
    lang_code = LANGS[language]["lang_code"]
    voice = LANGS[language]["voices"].get(voice_label) or next(
        iter(LANGS[language]["voices"].values())
    )
    use_video = "LTX" in (motion or "")
    # LTX needs /32; FLUX needs /16 — 704x1216 satisfies both and is LTX-native.
    gen_w, gen_h = (704, 1216) if use_video else (GEN_W, GEN_H)

    log = []

    def status(msg):
        # Append to the running log and return the full Markdown transcript.
        log.append(msg)
        return "\n\n".join(log)

    yield status("📝 Writing ad script with GLM…"), [], None
    title, scenes = write_script(product, selling_points, language, num_scenes)
    script_preview = "\n".join(
        f"**{i+1}. {s.get('caption','')}** — {s.get('narration','')}"
        for i, s in enumerate(scenes)
    )
    yield status(f"🎬 **{title}**\n\n{script_preview}\n\n🖼 Generating scenes on GPU…"), [], None

    model, tts, animate, assemble = _modal_handles()

    # Optional: use the uploaded product photo as hero (scene 1) and CTA (last).
    hero_bytes = _image_to_png_bytes(product_image) if product_image else None
    # Scenes covered by the hero photo need no generated image, so no GPU job
    # is spawned for them.
    hero_slots = {0, len(scenes) - 1} if hero_bytes else set()

    # Fan out image + TTS jobs in parallel across Modal containers.
    img_calls = [
        None
        if i in hero_slots
        else model.generate.spawn(prompt=s["image_prompt"], width=gen_w, height=gen_h)
        for i, s in enumerate(scenes)
    ]
    tts_calls = [
        tts.synth.spawn(text=s.get("narration", ""), voice=voice, lang_code=lang_code)
        for s in scenes
    ]

    images, gallery = [], []
    for i, call in enumerate(img_calls):
        png = hero_bytes if call is None else call.get()
        images.append(png)
        gallery.append(Image.open(io.BytesIO(png)))
        yield status(f"🖼 Scene {i+1}/{len(scenes)} ready…"), list(gallery), None

    # Optional: animate each still into a motion clip with LTX-Video.
    videos = [None] * len(scenes)
    if use_video:
        yield status("🎥 Animating scenes with LTX-Video (this is the slow part)…"), list(gallery), None
        anim_calls = [
            animate.animate.spawn(
                image_png=png,
                prompt=scene.get("image_prompt", ""),
                width=gen_w,
                height=gen_h,
            )
            for png, scene in zip(images, scenes)
        ]
        for i, c in enumerate(anim_calls):
            videos[i] = c.get()
            yield status(f"🎥 Motion clip {i+1}/{len(scenes)} ready…"), list(gallery), None

    yield status("🔊 Synthesizing voiceover…"), list(gallery), None
    audios = [c.get() for c in tts_calls]

    mode_label = "LTX motion" if use_video else "Ken Burns"
    yield status(f"🎞 Assembling video ({mode_label} + captions + voiceover)…"), list(gallery), None
    scene_payload = [
        {
            "image": png,
            "video": clip,
            "audio": audio,
            "caption": scene.get("caption", ""),
        }
        for png, clip, audio, scene in zip(images, videos, audios, scenes)
    ]
    mp4 = assemble.remote(scene_payload)

    # delete=False: the file must outlive this function so the UI can read it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
        out.write(mp4)
    yield status(f"✅ Done! **{title}** — {len(scenes)} scenes ({mode_label})."), list(gallery), out.name


def _image_to_png_bytes(path_or_img):
    """Convert an image path or array to RGB PNG bytes.

    Args:
        path_or_img: A filesystem path (``str`` or ``os.PathLike``), or an
            array accepted by ``PIL.Image.fromarray``.

    Returns:
        The PNG-encoded bytes, or ``None`` if the image cannot be read or
        encoded (best-effort: the caller then falls back to generated images).
    """
    try:
        if isinstance(path_or_img, (str, os.PathLike)):
            # The context manager closes the file handle; convert() returns an
            # independent in-memory image, so it is safe to use afterwards.
            with Image.open(path_or_img) as src:
                img = src.convert("RGB")
        else:
            img = Image.fromarray(path_or_img).convert("RGB")
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        return buf.getvalue()
    except Exception:  # noqa: BLE001 - deliberate best-effort, see docstring
        return None


# ---------------------------------------------------------------------------
# Single-image tab
# ---------------------------------------------------------------------------
def single_image(prompt, steps, width, height, seed):
    """Generate one image with the Modal ``Model`` class.

    Args:
        prompt: Text prompt (required, non-blank).
        steps: Number of inference steps.
        width: Output width in pixels.
        height: Output height in pixels.
        seed: Seed; ``-1`` (or an empty field) means random per the UI label.

    Returns:
        The generated image as a ``PIL.Image.Image``.

    Raises:
        gr.Error: On missing credentials, blank prompt, or a failed generation.
    """
    _need_modal()
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")
    # A cleared Number field arrives as None; treat it like the documented -1.
    seed = -1 if seed is None else int(seed)
    try:
        model = modal.Cls.from_name(MODAL_APP, "Model")()
        png = model.generate.remote(
            prompt=prompt,
            num_inference_steps=int(steps),
            width=int(width),
            height=int(height),
            seed=seed,
        )
    except Exception as exc:  # noqa: BLE001
        raise gr.Error(f"Generation failed: {exc}") from exc
    return Image.open(io.BytesIO(png))


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
def _on_lang_change(language):
    """Repopulate the voice dropdown for ``language`` and select its first voice."""
    voices = list(LANGS[language]["voices"])
    return gr.update(choices=voices, value=voices[0])


with gr.Blocks(title="tricket", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tab("🎬 Ad Studio"):
        with gr.Row():
            with gr.Column(scale=3):
                product = gr.Textbox(
                    label="Product",
                    placeholder="Aura Buds — wireless noise-cancelling earbuds with 30h battery",
                    lines=2,
                )
                selling = gr.Textbox(
                    label="Key selling points (optional)",
                    placeholder="noise cancelling, 30h battery, sweat-proof, instant pairing",
                    lines=2,
                )
                with gr.Row():
                    language = gr.Dropdown(
                        list(LANGS), value="中文", label="Narration language"
                    )
                    voice = gr.Dropdown(
                        list(LANGS["中文"]["voices"]),
                        value=list(LANGS["中文"]["voices"])[0],
                        label="Voice",
                    )
                    scenes_n = gr.Slider(3, 6, value=4, step=1, label="Scenes")
                product_img = gr.Image(
                    label="Product photo (optional — used as hero & end frame)",
                    type="filepath",
                    height=160,
                )
                motion = gr.Radio(
                    ["Ken Burns(快)", "AI 视频 · LTX(慢,更炫)"],
                    value="Ken Burns(快)",
                    label="Motion / 运镜",
                    info="AI 视频会为每个场景生成真实动态片段,明显更慢、更费 GPU。",
                )
                make_btn = gr.Button("🎬 Make Ad Video", variant="primary")
            with gr.Column(scale=4):
                status = gr.Markdown("Ready.")
                gallery = gr.Gallery(label="Scenes", columns=3, height=240)
                video = gr.Video(label="Ad video (9:16)")

        language.change(_on_lang_change, inputs=language, outputs=voice)
        make_btn.click(
            make_ad,
            inputs=[product, selling, language, voice, scenes_n, product_img, motion],
            outputs=[status, gallery, video],
        )

    with gr.Tab("🖼 Single Image"):
        with gr.Row():
            with gr.Column(scale=3):
                s_prompt = gr.Textbox(
                    label="Prompt",
                    lines=3,
                    placeholder="a cinematic photo of a red panda barista",
                )
                with gr.Row():
                    s_steps = gr.Slider(1, 8, value=4, step=1, label="Steps")
                    s_seed = gr.Number(value=-1, label="Seed (-1=random)", precision=0)
                with gr.Row():
                    s_w = gr.Slider(512, 1024, value=1024, step=64, label="Width")
                    s_h = gr.Slider(512, 1024, value=1024, step=64, label="Height")
                s_btn = gr.Button("Generate", variant="primary")
            with gr.Column(scale=4):
                s_out = gr.Image(label="Result", type="pil", height=512)

        s_btn.click(
            single_image,
            inputs=[s_prompt, s_steps, s_w, s_h, s_seed],
            outputs=s_out,
        )


if __name__ == "__main__":
    demo.launch()