Spaces:

MoonMath-ai
/

Prompt-2-Video

Running on Zero

App Files Files Community

Shalmoni committed on Oct 15

Commit

ac99ac3

verified ·

1 Parent(s): 85904ec

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -84

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os, json, uuid, re
 from datetime import datetime
 import gradio as gr
@@ -71,7 +72,7 @@ def _lazy_model_tok():
     _model = AutoModelForCausalLM.from_pretrained(
         STORYBOARD_MODEL,
         device_map="auto",
-        torch_dtype=preferred_dtype,     # <- correct kwarg
         trust_remote_code=True,
         use_safetensors=True
     )
@@ -111,7 +112,6 @@ def _prompt_with_tags(user_prompt: str, n_shots: int, default_fps: int, default_
         "Output must start with <JSON> and end with </JSON>.\n"
     )
 def _prompt_minimal(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
     return (
         "Reply ONLY with a JSON array starting with '[' and ending with ']'. No extra text.\n"
@@ -125,7 +125,7 @@ def _prompt_minimal(user_prompt: str, n_shots: int, default_fps: int, default_le
         f"  \"fps\": {default_fps},\n"
         "  \"steps\": 30,\n"
         "  \"seed\": null,\n"
-       '  "negative": ""\n'
         "}\n"
     )
@@ -170,14 +170,20 @@ def _extract_json_array(text: str) -> str:
     if start == -1:
         return ""
     depth = 0
     for i in range(start, len(text)):
         ch = text[i]
-        if ch == "[":
-            depth += 1
-        elif ch == "]":
-            depth -= 1
-            if depth == 0:
-                return text[start:i+1].strip()
     return ""
 def _normalize_shots(shots_raw, default_fps: int, default_len: int):
@@ -241,81 +247,116 @@ def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: in
     return _normalize_shots(shots_raw, default_fps, default_len)
 # =========================
-# IMAGE GEN (ZeroGPU) — sd-turbo t2i + img2img chaining
 # =========================
-from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
-SD_MODEL = os.getenv("SD_MODEL", "stabilityai/sd-turbo")
 _sd_t2i = None
 _sd_i2i = None
 def _lazy_sd_pipes():
     global _sd_t2i, _sd_i2i
     if _sd_t2i is not None and _sd_i2i is not None:
         return _sd_t2i, _sd_i2i
-    use_cuda = torch.cuda.is_available()
-    dtype = torch.float16 if use_cuda else torch.float32
     hf_token = os.getenv("HF_TOKEN", None)
     _sd_t2i = StableDiffusionPipeline.from_pretrained(
-        SD_MODEL,
-        torch_dtype=dtype,
-        safety_checker=None,
-        feature_extractor=None,
-        use_safetensors=True,
-        low_cpu_mem_usage=False,
-        token=hf_token
     )
-    if use_cuda:
-        _sd_t2i = _sd_t2i.to("cuda")
     _sd_i2i = StableDiffusionImg2ImgPipeline(
-        vae=_sd_t2i.vae,
-        text_encoder=_sd_t2i.text_encoder,
-        tokenizer=_sd_t2i.tokenizer,
-        unet=_sd_t2i.unet,
-        scheduler=_sd_t2i.scheduler,
-        safety_checker=None,
-        feature_extractor=None
     )
-    if use_cuda:
-        _sd_i2i = _sd_i2i.to("cuda")
     return _sd_t2i, _sd_i2i
 def _save_keyframe(pid: str, shot_id: int, img: Image.Image) -> str:
     pdir = project_dir(pid)
     out = os.path.join(pdir, "keyframes", f"shot_{shot_id:02d}.png")
     img.save(out)
     return out
 @spaces.GPU(duration=180)
 def generate_keyframe_image(
     pid: str,
     shot_idx: int,
     shots: list,
-    t2i_steps: int = 6,          # first shot
-    i2i_steps: int = 10,         # subsequent shots
-    i2i_strength: float = 0.65,  # change vs consistency
-    guidance_scale: float = 0.5,
-    width: int = 512,
-    height: int = 512
 ):
     """
     Generate image for shots[shot_idx].
-    - shot 0: text2img (few steps)
-    - shot k>0: img2img from previous approved image with higher strength/steps
-    Seed is kept SAME across all shots (stored in shots[i]['seed']).
     """
-    t2i, i2i = _lazy_sd_pipes()
     shot = shots[shot_idx]
     prompt   = (shot.get("description") or "").strip()
     negative = shot.get("negative") or ""
     seed     = shot.get("seed", None)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     gen = torch.Generator(device)
     if isinstance(seed, int):
         gen = gen.manual_seed(int(seed))
@@ -323,40 +364,57 @@ def generate_keyframe_image(
     width  = max(256, min(1024, int(width)))
     height = max(256, min(1024, int(height)))
-    if shot_idx == 0 or not shots[shot_idx - 1].get("image_path"):
-        out = t2i(
-            prompt=prompt,
-            negative_prompt=negative,
-            guidance_scale=guidance_scale,
-            num_inference_steps=int(max(1, t2i_steps)),
-            generator=gen,
-            width=width,
-            height=height
-        ).images[0]
     else:
-        prev_path = shots[shot_idx - 1].get("image_path")
-        if prev_path and os.path.exists(prev_path):
             init_image = Image.open(prev_path).convert("RGB")
-            strength = float(i2i_strength)
-            strength = min(max(strength, 0.50), 0.90)
             out = i2i(
                 prompt=prompt,
-                negative_prompt=negative,
                 image=init_image,
-                guidance_scale=guidance_scale,
-                strength=strength,
-                num_inference_steps=int(max(2, i2i_steps)),
                 generator=gen
             ).images[0]
-        else:
             out = t2i(
                 prompt=prompt,
                 negative_prompt=negative,
-                guidance_scale=guidance_scale,
-                num_inference_steps=int(max(1, t2i_steps)),
                 generator=gen,
-                width=width,
-                height=height
             ).images[0]
     saved_path = _save_keyframe(pid, int(shot["id"]), out)
@@ -392,7 +450,7 @@ def df_to_shots(df: pd.DataFrame) -> list:
 # =========================
 with gr.Blocks() as demo:
     gr.Markdown("# 🎬 Storyboard → Keyframes → Videos → Export")
-    gr.Markdown("Edit storyboard prompts, then generate keyframes. Shots 2+ use the previous approved image for consistency. A single project seed is locked for a cohesive look.")
     # State
     project = gr.State(None)
@@ -439,11 +497,11 @@ with gr.Blocks() as demo:
             with gr.Row():
                 gen_btn = gr.Button("Generate / Regenerate", variant="primary")
                 approve_next_btn = gr.Button("Approve & Next →", variant="secondary")
-            # tuning controls
             with gr.Row():
-                img_strength = gr.Slider(0.40, 0.90, value=0.65, step=0.05, label="Change vs Consistency (img2img strength)")
-                img_steps    = gr.Slider(4, 20, value=10,  step=1,   label="Img2Img Steps")
-                guidance     = gr.Slider(0.0, 2.0, value=0.5, step=0.05, label="Guidance Scale")
             with gr.Row():
                 prev_img = gr.Image(label="Previous approved image (conditioning)", type="filepath")
                 out_img  = gr.Image(label="Generated image", type="filepath")
@@ -473,7 +531,6 @@ with gr.Blocks() as demo:
         p["shots"] = shots
         p["meta"]["updated"] = now_iso()
         save_project(p)
-        # Enable Save Edits after storyboard exists
         return p, shots_to_df(shots), gr.update(value="Storyboard generated (editable)."), gr.update(interactive=True)
     propose_btn.click(
@@ -503,11 +560,8 @@ with gr.Blocks() as demo:
         # lock a single seed for the project:
         proj_seed = None
-        # override if user supplied:
         if proj_seed_override not in [None, ""] and str(proj_seed_override).isdigit():
             proj_seed = int(proj_seed_override)
-        # otherwise use existing project meta seed or find one in shots:
         if proj_seed is None:
             proj_seed = p.get("meta", {}).get("seed", None)
         if proj_seed is None:
@@ -518,7 +572,6 @@ with gr.Blocks() as demo:
         if proj_seed is None:
             proj_seed = int(torch.randint(0, 2**31 - 1, (1,)).item())
-        # apply to all shots missing seed
         for s in shots:
             if not isinstance(s.get("seed"), int):
                 s["seed"] = proj_seed
@@ -549,19 +602,19 @@ with gr.Blocks() as demo:
         shots = p["shots"]
         if idx < 0 or idx >= len(shots): raise gr.Error("Invalid shot index.")
         shots[idx]["description"] = current_prompt  # allow tweaking
-        prev_path = shots[idx-1]["image_path"] if idx > 0 else None
         img_path = generate_keyframe_image(
             p["meta"]["id"],
             int(idx),
             shots,
-            t2i_steps=6,
             i2i_steps=int(i2i_steps_val),
             i2i_strength=float(i2i_strength_val),
             guidance_scale=float(guidance_val),
-            width=512,
-            height=512
         )
         return img_path, (prev_path or None), gr.update(value=f"Generated candidate for shot {shots[idx]['id']}.")
     gen_btn.click(

+# app.py
 import os, json, uuid, re
 from datetime import datetime
 import gradio as gr
     _model = AutoModelForCausalLM.from_pretrained(
         STORYBOARD_MODEL,
         device_map="auto",
+        torch_dtype=preferred_dtype,
         trust_remote_code=True,
         use_safetensors=True
     )
         "Output must start with <JSON> and end with </JSON>.\n"
     )
 def _prompt_minimal(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
     return (
         "Reply ONLY with a JSON array starting with '[' and ending with ']'. No extra text.\n"
         f"  \"fps\": {default_fps},\n"
         "  \"steps\": 30,\n"
         "  \"seed\": null,\n"
+        '  "negative": ""\n'
         "}\n"
     )
     if start == -1:
         return ""
     depth = 0
+    in_str = False
+    prev = ""
     for i in range(start, len(text)):
         ch = text[i]
+        if ch == '"' and prev != '\\':
+            in_str = not in_str
+        if not in_str:
+            if ch == "[":
+                depth += 1
+            elif ch == "]":
+                depth -= 1
+                if depth == 0:
+                    return text[start:i+1].strip()
+        prev = ch
     return ""
 def _normalize_shots(shots_raw, default_fps: int, default_len: int):
     return _normalize_shots(shots_raw, default_fps, default_len)
 # =========================
+# IMAGE GEN — FLUX first, SD-Turbo fallback
 # =========================
+USE_CUDA = torch.cuda.is_available()
+DTYPE = torch.float16 if USE_CUDA else torch.float32
+FLUX_MODEL = os.getenv("FLUX_MODEL", "black-forest-labs/FLUX.1-Nano")  # or "black-forest-labs/FLUX.1-dev"
+SD_MODEL   = os.getenv("SD_MODEL", "stabilityai/sd-turbo")
+_flux_t2i = None
+_flux_i2i = None
 _sd_t2i = None
 _sd_i2i = None
+_have_flux = None
def _lazy_flux_pipes():
    """Load the FLUX text2img and img2img pipelines once and cache them.

    The img2img pipeline is created from the text2img pipeline's components
    instead of a second ``from_pretrained`` call, so the model weights are
    held in memory only once.

    Returns:
        tuple: ``(t2i, i2i)`` - a ``FluxPipeline`` and a ``FluxImg2ImgPipeline``.

    Raises:
        Exception: anything raised while importing the FLUX classes or while
            loading ``FLUX_MODEL``. Nothing is cached when that happens, so no
            half-initialised pipeline keeps memory pinned.

    NOTE(review): ``DTYPE`` is float16 on CUDA; FLUX reference usage is
    typically bfloat16 - confirm float16 is intended for this checkpoint.
    """
    # Imported inside the function, so a missing FLUX class surfaces to the caller.
    from diffusers import FluxPipeline, FluxImg2ImgPipeline
    global _flux_t2i, _flux_i2i
    if _flux_t2i is not None and _flux_i2i is not None:
        return _flux_t2i, _flux_i2i
    t2i = FluxPipeline.from_pretrained(FLUX_MODEL, torch_dtype=DTYPE, use_safetensors=True)
    if USE_CUDA:
        t2i = t2i.to("cuda")
    # Share the loaded components; torch_dtype is passed explicitly so the
    # shared modules are never cast away from DTYPE.
    i2i = FluxImg2ImgPipeline.from_pipe(t2i, torch_dtype=DTYPE)
    if USE_CUDA:
        i2i = i2i.to("cuda")
    # Publish both globals together, only after both pipelines exist.
    _flux_t2i, _flux_i2i = t2i, i2i
    return _flux_t2i, _flux_i2i
 def _lazy_sd_pipes():
     """Build the SD-Turbo text2img / img2img pair on first use and cache it.

     Returns:
         tuple: ``(t2i, i2i)``. The img2img pipeline is assembled from the
         text2img pipeline's own components rather than loaded separately.
     """
     from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
     global _sd_t2i, _sd_i2i
     if _sd_t2i is None or _sd_i2i is None:
         _sd_t2i = StableDiffusionPipeline.from_pretrained(
             SD_MODEL,
             torch_dtype=DTYPE,
             safety_checker=None,
             feature_extractor=None,
             use_safetensors=True,
             low_cpu_mem_usage=False,
             token=os.getenv("HF_TOKEN", None),
         )
         if USE_CUDA:
             _sd_t2i = _sd_t2i.to("cuda")
         # Reuse the modules that were just loaded for the img2img variant.
         _sd_i2i = StableDiffusionImg2ImgPipeline(
             vae=_sd_t2i.vae,
             text_encoder=_sd_t2i.text_encoder,
             tokenizer=_sd_t2i.tokenizer,
             unet=_sd_t2i.unet,
             scheduler=_sd_t2i.scheduler,
             safety_checker=None,
             feature_extractor=None,
         )
         if USE_CUDA:
             _sd_i2i = _sd_i2i.to("cuda")
     return _sd_t2i, _sd_i2i
def _try_get_pipes():
    """Prefer FLUX; fall back to SD-Turbo.

    The first call probes FLUX by trying to load its pipelines and remembers
    the outcome in ``_have_flux``; later calls reuse that decision.

    Returns:
        tuple: ``(mode, t2i, i2i)`` where ``mode`` is ``'flux'`` or ``'sd'``.
    """
    global _have_flux
    if _have_flux is None:
        try:
            _lazy_flux_pipes()
        except Exception:
            # Best-effort by design: any FLUX failure selects the fallback,
            # but the reason is logged so the downgrade is not silent.
            import logging
            logging.getLogger(__name__).warning(
                "FLUX pipelines unavailable; falling back to SD-Turbo.",
                exc_info=True,
            )
            _have_flux = False
        else:
            _have_flux = True
    if _have_flux:
        return ("flux", *_lazy_flux_pipes())
    return ("sd", *_lazy_sd_pipes())
 def _save_keyframe(pid: str, shot_id: int, img: Image.Image) -> str:
     """Save ``img`` as ``shot_NN.png`` in the project's ``keyframes`` folder and return its path."""
     filename = f"shot_{shot_id:02d}.png"
     target = os.path.join(project_dir(pid), "keyframes", filename)
     img.save(target)
     return target
+def _significant_change(curr_desc: str, prev_desc: str) -> bool:
+    """
+    Heuristic: if symmetric difference of tokens is large -> treat as a new scene,
+    so we should text2img (seed keeps style) instead of img2img.
+    """
+    if not prev_desc: return True
+    a = set(re.findall(r"\w+", curr_desc.lower()))
+    b = set(re.findall(r"\w+", prev_desc.lower()))
+    # weights: boost composition-y words
+    comp_words = {"wide","close","low","high","overhead","aerial","profile","left","right","center",
+                  "portrait","landscape","long","establishing","macro","tilt","dutch","angle",
+                  "night","day","sunset","sunrise","noon","backlit","rim","key","fill"}
+    delta = a.symmetric_difference(b)
+    score = len(delta) + 2 * len((a ^ b) & comp_words)
+    return score >= 12  # tune threshold 10–16
 @spaces.GPU(duration=180)
 def generate_keyframe_image(
     pid: str,
     shot_idx: int,
     shots: list,
+    t2i_steps: int = 14,         # FLUX likes 12–20
+    i2i_steps: int = 16,
+    i2i_strength: float = 0.8,   # higher = follow prompt more
+    guidance_scale: float = 3.0, # FLUX sweet spot ~2.5–3.5
+    width: int = 640,
+    height: int = 640
 ):
     """
     Generate image for shots[shot_idx].
+    - shot 0: text2img
+    - shot k>0: smart chaining
+        * if significant change: text2img (same seed for style)
+        * else: img2img from previous approved image
     """
+    mode, t2i, i2i = _try_get_pipes()
     shot = shots[shot_idx]
     prompt   = (shot.get("description") or "").strip()
     negative = shot.get("negative") or ""
     seed     = shot.get("seed", None)
+    device = "cuda" if USE_CUDA else "cpu"
     gen = torch.Generator(device)
     if isinstance(seed, int):
         gen = gen.manual_seed(int(seed))
     width  = max(256, min(1024, int(width)))
     height = max(256, min(1024, int(height)))
+    # decide chaining
+    use_prev = False
+    prev_path = shots[shot_idx - 1].get("image_path") if shot_idx > 0 else None
+    if shot_idx == 0 or not prev_path or not os.path.exists(prev_path):
+        use_prev = False
     else:
+        prev_desc = shots[shot_idx - 1].get("description") or ""
+        use_prev = not _significant_change(prompt, prev_desc)
+    # invoke
+    if mode == "flux":
+        if not use_prev:
+            out = t2i(
+                prompt=prompt,
+                negative_prompt=negative or None,
+                num_inference_steps=int(max(8, t2i_steps)),
+                guidance_scale=float(max(2.0, guidance_scale)),
+                generator=gen,
+                width=width, height=height
+            ).images[0]
+        else:
             init_image = Image.open(prev_path).convert("RGB")
             out = i2i(
                 prompt=prompt,
+                negative_prompt=negative or None,
                 image=init_image,
+                strength=float(min(max(i2i_strength, 0.5), 0.95)),
+                num_inference_steps=int(max(10, i2i_steps)),
+                guidance_scale=float(max(2.0, guidance_scale)),
                 generator=gen
             ).images[0]
+    else:
+        # SD-turbo fallback (keep your original behavior but with less mushy defaults)
+        if not use_prev:
             out = t2i(
                 prompt=prompt,
                 negative_prompt=negative,
+                guidance_scale=1.0,
+                num_inference_steps=int(max(6, t2i_steps//2)),
                 generator=gen,
+                width=width, height=height
+            ).images[0]
+        else:
+            init_image = Image.open(prev_path).convert("RGB")
+            out = i2i(
+                prompt=prompt,
+                negative_prompt=negative,
+                image=init_image,
+                strength=float(min(max(i2i_strength, 0.55), 0.9)),
+                num_inference_steps=int(max(8, i2i_steps//2)),
+                generator=gen
             ).images[0]
     saved_path = _save_keyframe(pid, int(shot["id"]), out)
 # =========================
 with gr.Blocks() as demo:
     gr.Markdown("# 🎬 Storyboard → Keyframes → Videos → Export")
+    gr.Markdown("Edit storyboard prompts, then generate keyframes. **Smart chaining**: only reuse the previous image if the new prompt is similar; otherwise we regenerate from text with the same seed for style consistency.")
     # State
     project = gr.State(None)
             with gr.Row():
                 gen_btn = gr.Button("Generate / Regenerate", variant="primary")
                 approve_next_btn = gr.Button("Approve & Next →", variant="secondary")
+            # tuning controls (defaults tuned for FLUX; fallback will downshift)
             with gr.Row():
+                img_strength = gr.Slider(0.50, 0.95, value=0.80, step=0.05, label="Change vs Consistency (img2img strength)")
+                img_steps    = gr.Slider(8, 28, value=16,  step=1,   label="Inference Steps (img2img)")
+                guidance     = gr.Slider(2.0, 4.0, value=3.0, step=0.1, label="Guidance Scale")
             with gr.Row():
                 prev_img = gr.Image(label="Previous approved image (conditioning)", type="filepath")
                 out_img  = gr.Image(label="Generated image", type="filepath")
         p["shots"] = shots
         p["meta"]["updated"] = now_iso()
         save_project(p)
         return p, shots_to_df(shots), gr.update(value="Storyboard generated (editable)."), gr.update(interactive=True)
     propose_btn.click(
         # lock a single seed for the project:
         proj_seed = None
         if proj_seed_override not in [None, ""] and str(proj_seed_override).isdigit():
             proj_seed = int(proj_seed_override)
         if proj_seed is None:
             proj_seed = p.get("meta", {}).get("seed", None)
         if proj_seed is None:
         if proj_seed is None:
             proj_seed = int(torch.randint(0, 2**31 - 1, (1,)).item())
         for s in shots:
             if not isinstance(s.get("seed"), int):
                 s["seed"] = proj_seed
         shots = p["shots"]
         if idx < 0 or idx >= len(shots): raise gr.Error("Invalid shot index.")
         shots[idx]["description"] = current_prompt  # allow tweaking
         img_path = generate_keyframe_image(
             p["meta"]["id"],
             int(idx),
             shots,
+            t2i_steps=14,  # tuned for FLUX
             i2i_steps=int(i2i_steps_val),
             i2i_strength=float(i2i_strength_val),
             guidance_scale=float(guidance_val),
+            width=640,
+            height=640
         )
+        prev_path = shots[idx-1]["image_path"] if idx > 0 else None
         return img_path, (prev_path or None), gr.update(value=f"Generated candidate for shot {shots[idx]['id']}.")
     gen_btn.click(