Spaces:
Running
Running
| """ | |
| tricket — AI product-ad video studio (Gradio frontend, runs on a HF Space). | |
| Tab 1 🎬 Ad Studio : one prompt -> ad script -> per-scene image + voiceover -> | |
| Ken Burns slideshow with burned captions -> 9:16 MP4. | |
| Tab 2 🖼 Single Image : plain FLUX.1-schnell text-to-image. | |
| Heavy lifting runs on Modal (FLUX / Kokoro / ffmpeg). The script is written by | |
| a GLM model (default zai-org/GLM-4.6, overridable via the SCRIPT_MODEL env var) through the Hugging Face inference router. | |
| Required Space secrets (Settings -> Variables and secrets): | |
| MODAL_TOKEN_ID | |
| MODAL_TOKEN_SECRET | |
| HF_TOKEN (for the GLM script-writer via router.huggingface.co) | |
| """ | |
| import io | |
| import json | |
| import os | |
| import re | |
| import tempfile | |
| import gradio as gr | |
| import modal | |
| from PIL import Image | |
# Modal app name used to look up the remote classes and function.
MODAL_APP = "tricket-flux"

# Default script-writer model requested through the HF router; the
# SCRIPT_MODEL environment variable takes precedence when it is set.
SCRIPT_MODEL = "zai-org/GLM-4.6"

# Portrait generation size handed to FLUX (both sides are multiples of 16).
GEN_W = 768
GEN_H = 1344

# Kokoro voices per narration language. Each entry carries the language code
# sent with TTS requests and a mapping of UI label -> Kokoro voice id. The
# first voice of a language doubles as its default.
LANGS = {
    "中文": dict(
        lang_code="z",
        voices={
            "晓晓 · 女声": "zf_xiaoxiao",
            "小贝 · 女声": "zf_xiaobei",
            "云健 · 男声": "zm_yunjian",
            "云希 · 男声": "zm_yunxi",
        },
    ),
    "English": dict(
        lang_code="a",
        voices={
            "Heart · F": "af_heart",
            "Bella · F": "af_bella",
            "Michael · M": "am_michael",
            "Puck · M": "am_puck",
        },
    ),
}
| # --------------------------------------------------------------------------- | |
| # Modal handles (resolved lazily so import never crashes the Space). | |
| # --------------------------------------------------------------------------- | |
def _modal_handles():
    """Look up the remote Modal entry points used by the Ad Studio.

    Returns:
        A 4-tuple ``(model, tts, animate, assemble)``: instances of the
        ``Model``, ``TTS`` and ``Animate`` classes plus the
        ``assemble_video`` function handle, all resolved by name from the
        ``MODAL_APP`` app.
    """

    def instance_of(cls_name):
        # Resolve the class by name, then instantiate it with no arguments.
        return modal.Cls.from_name(MODAL_APP, cls_name)()

    model, tts, animate = (instance_of(name) for name in ("Model", "TTS", "Animate"))
    assemble = modal.Function.from_name(MODAL_APP, "assemble_video")
    return model, tts, animate, assemble
def _need_modal():
    """Raise ``gr.Error`` unless both Modal credential env vars are set and non-empty."""
    required = ("MODAL_TOKEN_ID", "MODAL_TOKEN_SECRET")
    if all(os.environ.get(name) for name in required):
        return
    raise gr.Error(
        "Modal credentials missing — add MODAL_TOKEN_ID and MODAL_TOKEN_SECRET "
        "to this Space's secrets."
    )
def _need_hf():
    """Raise ``gr.Error`` unless the ``HF_TOKEN`` env var is set and non-empty."""
    if os.environ.get("HF_TOKEN"):
        return
    raise gr.Error(
        "HF_TOKEN missing — add it to this Space's secrets (used by the "
        "GLM script writer)."
    )
| # --------------------------------------------------------------------------- | |
| # Script generation (GLM via HF router) | |
| # --------------------------------------------------------------------------- | |
def write_script(product, selling_points, language, num_scenes):
    """Ask the script-writer model for a short vertical product-ad script.

    The request goes through the Hugging Face router using the model named by
    the ``SCRIPT_MODEL`` environment variable (falling back to the module
    constant of the same name).

    Args:
        product: Free-text product description; also the fallback title.
        selling_points: Optional selling points; when empty the model is told
            to infer sensible ones.
        language: UI language key. ``"中文"`` selects Simplified Chinese,
            anything else selects English.
        num_scenes: Number of scenes to request; the reply is truncated to
            at most this many.

    Returns:
        ``(title, scenes)`` where ``scenes`` is a non-empty list of dicts as
        returned by the model (requested keys: ``narration``, ``caption``,
        ``image_prompt``).

    Raises:
        gr.Error: The reply could not be parsed or contains no usable scenes.
        KeyError: ``HF_TOKEN`` is not set in the environment.
    """
    from openai import OpenAI

    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )
    lang_name = "Simplified Chinese" if language == "中文" else "English"
    system_msg = (
        "You are an award-winning short-form video ad creative. "
        "You write punchy vertical (9:16) product ads."
    )
    user_msg = f"""Create a {num_scenes}-scene vertical product ad.
Product: {product}
Key selling points: {selling_points or "(infer sensible ones)"}
Rules:
- Narration language: {lang_name}. Each scene narration is ONE short spoken sentence (max ~16 words), energetic.
- caption: a VERY short on-screen text overlay in {lang_name} (max ~6 words).
- image_prompt: a vivid ENGLISH text-to-image prompt for that scene, cinematic, vertical composition, no text in image.
- Scene 1 = hook / hero shot. Last scene = call to action.
- Return STRICT JSON only, no markdown, schema:
{{"title": str, "scenes": [{{"narration": str, "caption": str, "image_prompt": str}}]}}"""
    resp = client.chat.completions.create(
        model=os.environ.get("SCRIPT_MODEL", SCRIPT_MODEL),
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0.8,
    )
    raw = resp.choices[0].message.content or ""
    data = _parse_json(raw)

    # The reply is valid JSON at this point but not necessarily the requested
    # object: accept a bare list of scenes, and treat any other shape as
    # "no scenes" instead of crashing on a missing .get().
    if isinstance(data, dict):
        title = data.get("title")
        raw_scenes = data.get("scenes")
    else:
        title = None
        raw_scenes = data
    if not isinstance(raw_scenes, list):
        raw_scenes = []

    # Downstream code calls .get() on every scene, so keep only dict entries.
    scenes = [scene for scene in raw_scenes if isinstance(scene, dict)][:num_scenes]
    if not scenes:
        raise gr.Error("Script generation returned no scenes; try again.")
    # A null or empty title falls back to the product text as well.
    return title or product, scenes
def _parse_json(raw):
    """Parse a model reply as JSON, tolerating Markdown fences and chatter.

    A leading ```` ``` ```` / ```` ```json ```` fence and a trailing
    ```` ``` ```` fence are stripped first. The remaining text is parsed as
    JSON; if that fails, the span from the first ``{`` to the last ``}`` is
    tried instead.

    Args:
        raw: The reply text.

    Returns:
        The decoded JSON value.

    Raises:
        gr.Error: Neither the cleaned text nor its braced span is valid JSON.
    """
    text = raw.strip()
    text = re.sub(r"^```(?:json)?", "", text).strip()
    text = re.sub(r"```$", "", text).strip()

    candidates = [text]
    braced = re.search(r"\{.*\}", text, re.DOTALL)
    if braced:
        candidates.append(braced.group(0))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Try the next candidate; a decode failure in the fallback must
            # surface as the friendly error below, not as a raw traceback.
            continue
    raise gr.Error("Could not parse script JSON from the model.")
| # --------------------------------------------------------------------------- | |
| # Ad Studio orchestration (streaming generator) | |
| # --------------------------------------------------------------------------- | |
def make_ad(product, selling_points, language, voice_label, num_scenes, product_image, motion):
    """Generate a product ad video, streaming progress as it goes.

    This is a generator. Every yield is a triple
    ``(status_markdown, gallery_images, video_path)``; ``video_path`` stays
    ``None`` until the final yield, which carries the path of the MP4 file.

    Args:
        product: Product description (required, non-blank).
        selling_points: Optional selling points forwarded to the script writer.
        language: Key into ``LANGS`` selecting narration language and voices.
        voice_label: Voice label within that language; unknown labels fall
            back to the language's first voice.
        num_scenes: Requested number of scenes.
        product_image: Optional product photo; when it can be read it
            replaces the generated image of the first and last scene.
        motion: Motion mode label; any label containing ``"LTX"`` enables
            per-scene video animation.

    Raises:
        gr.Error: Missing credentials, blank product, or an unusable script.
    """
    _need_modal()
    _need_hf()
    if not product or not product.strip():
        raise gr.Error("Please describe your product.")

    num_scenes = int(num_scenes)
    lang = LANGS[language]
    lang_code = lang["lang_code"]
    voice = lang["voices"].get(voice_label) or next(iter(lang["voices"].values()))
    use_video = "LTX" in (motion or "")
    # LTX needs /32; FLUX needs /16 — 704x1216 satisfies both.
    gen_w, gen_h = (704, 1216) if use_video else (GEN_W, GEN_H)

    log = []

    def status(msg):
        # Accumulate messages so the status panel shows the whole history.
        log.append(msg)
        return "\n\n".join(log)

    yield status("📝 Writing ad script with GLM…"), [], None
    title, scenes = write_script(product, selling_points, language, num_scenes)
    total = len(scenes)
    script_preview = "\n".join(
        f"**{i+1}. {s.get('caption','')}** — {s.get('narration','')}"
        for i, s in enumerate(scenes)
    )
    yield status(f"🎬 **{title}**\n\n{script_preview}\n\n🖼 Generating scenes on GPU…"), [], None

    model, tts, animate, assemble = _modal_handles()

    # Optional: use the uploaded product photo as hero (scene 1) and CTA (last).
    # Resolved before spawning so no image job is started for scenes whose
    # output would be thrown away in favour of the photo.
    hero_bytes = _image_to_png_bytes(product_image) if product_image else None
    hero_slots = {0, total - 1} if hero_bytes else set()

    # Spawn image + TTS jobs up front; results are collected further down.
    img_calls = [
        None
        if i in hero_slots
        else model.generate.spawn(
            # A scene without an image prompt falls back to the product text
            # rather than raising KeyError.
            prompt=s.get("image_prompt") or product,
            width=gen_w,
            height=gen_h,
        )
        for i, s in enumerate(scenes)
    ]
    tts_calls = [
        tts.synth.spawn(text=s.get("narration", ""), voice=voice, lang_code=lang_code)
        for s in scenes
    ]

    images, gallery = [], []
    for i, call in enumerate(img_calls):
        png = hero_bytes if call is None else call.get()
        images.append(png)
        gallery.append(Image.open(io.BytesIO(png)))
        yield status(f"🖼 Scene {i+1}/{total} ready…"), list(gallery), None

    # Optional: animate each still into a motion clip with LTX-Video.
    videos = [None] * total
    if use_video:
        yield status("🎥 Animating scenes with LTX-Video (this is the slow part)…"), list(gallery), None
        anim_calls = [
            animate.animate.spawn(
                image_png=png,
                prompt=s.get("image_prompt", ""),
                width=gen_w,
                height=gen_h,
            )
            for png, s in zip(images, scenes)
        ]
        for i, call in enumerate(anim_calls):
            videos[i] = call.get()
            yield status(f"🎥 Motion clip {i+1}/{total} ready…"), list(gallery), None

    yield status("🔊 Synthesizing voiceover…"), list(gallery), None
    audios = [call.get() for call in tts_calls]

    mode_label = "LTX motion" if use_video else "Ken Burns"
    yield status(f"🎞 Assembling video ({mode_label} + captions + voiceover)…"), list(gallery), None
    scene_payload = [
        {
            "image": png,
            "video": clip,
            "audio": audio,
            "caption": s.get("caption", ""),
        }
        for png, clip, audio, s in zip(images, videos, audios, scenes)
    ]
    mp4 = assemble.remote(scene_payload)

    # delete=False keeps the file on disk after the handle closes so the
    # returned path stays valid; the context manager closes the handle even
    # if the write fails.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as out:
        out.write(mp4)
    yield status(f"✅ Done! **{title}** — {total} scenes ({mode_label})."), list(gallery), out.name
def _image_to_png_bytes(path_or_img):
    """Re-encode an image as RGB PNG bytes; return ``None`` on any failure.

    A ``str`` argument is opened with ``Image.open``; any other value is
    handed to ``Image.fromarray``.
    """
    try:
        if isinstance(path_or_img, str):
            picture = Image.open(path_or_img)
        else:
            picture = Image.fromarray(path_or_img)
        sink = io.BytesIO()
        picture.convert("RGB").save(sink, format="PNG")
        return sink.getvalue()
    except Exception:
        # Best-effort by design: the caller treats None as "no usable image".
        return None
| # --------------------------------------------------------------------------- | |
| # Single-image tab | |
| # --------------------------------------------------------------------------- | |
def single_image(prompt, steps, width, height, seed):
    """Generate one image from a text prompt via the remote ``Model`` class.

    Args:
        prompt: Text prompt (required, non-blank).
        steps: Number of inference steps; coerced to ``int``.
        width: Output width; coerced to ``int``.
        height: Output height; coerced to ``int``.
        seed: Seed value; coerced to ``int``.

    Returns:
        The generated image opened as a PIL image.

    Raises:
        gr.Error: Missing Modal credentials, blank prompt, or any failure
            while preparing or running the remote call.
    """
    _need_modal()
    if not (prompt and prompt.strip()):
        raise gr.Error("Please enter a prompt.")

    flux = modal.Cls.from_name(MODAL_APP, "Model")()
    try:
        request = dict(
            prompt=prompt,
            num_inference_steps=int(steps),
            width=int(width),
            height=int(height),
            seed=int(seed),
        )
        png_bytes = flux.generate.remote(**request)
    except Exception as exc:  # noqa: BLE001
        raise gr.Error(f"Generation failed: {exc}")
    return Image.open(io.BytesIO(png_bytes))
| # --------------------------------------------------------------------------- | |
| # UI | |
| # --------------------------------------------------------------------------- | |
def _on_lang_change(language):
    """Build a dropdown update listing *language*'s voices, first one selected."""
    labels = [*LANGS[language]["voices"]]
    return gr.update(choices=labels, value=labels[0])
# NOTE(review): this section arrived with its indentation flattened; the
# nesting of the layout containers below is reconstructed — confirm it
# against the original layout before merging.
with gr.Blocks(title="tricket", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        value="""
# 🎬 tricket — one prompt → a product ad video
Describe a product → GLM writes the script → **FLUX** paints each scene →
**Kokoro** voices it → **ffmpeg** cuts a vertical (9:16) ad.
GPU runs on-demand on **Modal**. First run after idle ~30–60s (cold start).
"""
    )

    # ---- Tab 1: full ad pipeline ------------------------------------------
    with gr.Tab(label="🎬 Ad Studio"):
        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=3):
                product = gr.Textbox(
                    label="Product",
                    placeholder="Aura Buds — wireless noise-cancelling earbuds with 30h battery",
                    lines=2,
                )
                selling = gr.Textbox(
                    label="Key selling points (optional)",
                    placeholder="noise cancelling, 30h battery, sweat-proof, instant pairing",
                    lines=2,
                )
                with gr.Row():
                    language = gr.Dropdown(
                        choices=list(LANGS),
                        value="中文",
                        label="Narration language",
                    )
                    # Starts on the Chinese voice list to match the default
                    # language above; the change handler below swaps it.
                    voice = gr.Dropdown(
                        choices=list(LANGS["中文"]["voices"]),
                        value=next(iter(LANGS["中文"]["voices"])),
                        label="Voice",
                    )
                    scenes_n = gr.Slider(minimum=3, maximum=6, value=4, step=1, label="Scenes")
                product_img = gr.Image(
                    label="Product photo (optional — used as hero & end frame)",
                    type="filepath",
                    height=160,
                )
                motion = gr.Radio(
                    choices=["Ken Burns(快)", "AI 视频 · LTX(慢,更炫)"],
                    value="Ken Burns(快)",
                    label="Motion / 运镜",
                    info="AI 视频会为每个场景生成真实动态片段,明显更慢、更费 GPU。",
                )
                make_btn = gr.Button(value="🎬 Make Ad Video", variant="primary")
            # Right column: streamed outputs.
            with gr.Column(scale=4):
                status = gr.Markdown(value="Ready.")
                gallery = gr.Gallery(label="Scenes", columns=3, height=240)
                video = gr.Video(label="Ad video (9:16)")

        # Keep the voice list in sync with the selected language.
        language.change(fn=_on_lang_change, inputs=language, outputs=voice)
        make_btn.click(
            fn=make_ad,
            inputs=[product, selling, language, voice, scenes_n, product_img, motion],
            outputs=[status, gallery, video],
        )

    # ---- Tab 2: single text-to-image call ---------------------------------
    with gr.Tab(label="🖼 Single Image"):
        with gr.Row():
            with gr.Column(scale=3):
                s_prompt = gr.Textbox(
                    label="Prompt",
                    lines=3,
                    placeholder="a cinematic photo of a red panda barista",
                )
                with gr.Row():
                    s_steps = gr.Slider(minimum=1, maximum=8, value=4, step=1, label="Steps")
                    s_seed = gr.Number(value=-1, label="Seed (-1=random)", precision=0)
                with gr.Row():
                    s_w = gr.Slider(minimum=512, maximum=1024, value=1024, step=64, label="Width")
                    s_h = gr.Slider(minimum=512, maximum=1024, value=1024, step=64, label="Height")
                s_btn = gr.Button(value="Generate", variant="primary")
            with gr.Column(scale=4):
                s_out = gr.Image(label="Result", type="pil", height=512)

        s_btn.click(
            fn=single_image,
            inputs=[s_prompt, s_steps, s_w, s_h, s_seed],
            outputs=s_out,
        )

if __name__ == "__main__":
    demo.launch()