Spaces:

MoonMath-ai
/

Prompt-2-Video

Running on Zero

App Files Files Community

Shalmoni committed on Oct 5

Commit

5ac63ce

verified ·

1 Parent(s): 58c4d87

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -52

app.py CHANGED Viewed

@@ -33,7 +33,7 @@ def load_project_file(file_obj):
     return proj
 # =========================
-# LLM (ZeroGPU) — Storyboard generator (robust JSON)
 # =========================
 from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -51,13 +51,12 @@ def _lazy_model_tok():
     _model = AutoModelForCausalLM.from_pretrained(
         STORYBOARD_MODEL,
         device_map="auto",
-        dtype="auto",             # prefer `dtype` (torch_dtype is deprecated)
         trust_remote_code=True,
     )
     return _model, _tokenizer
-def _storyboard_prompt(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
-    # Force the model to wrap JSON in tags; makes parsing deterministic.
     return (
         "Return ONLY a JSON array, enclosed between <JSON> and </JSON>.\n"
         f"Create a storyboard of {n_shots} shots for this idea:\n\n"
@@ -77,55 +76,38 @@ def _storyboard_prompt(user_prompt: str, n_shots: int, default_fps: int, default
         "Output:\n<JSON>\n[ { ... }, ... ]\n</JSON>\n"
     )
-def _extract_json_array(text: str) -> str:
-    """
-    Prefer <JSON>...</JSON>. Fallback: first balanced top-level JSON array.
-    """
-    m = re.search(r"<JSON>(.*?)</JSON>", text, flags=re.DOTALL | re.IGNORECASE)
-    if m:
-        return m.group(1).strip()
-    start = text.find("[")
-    if start == -1:
-        raise ValueError("No JSON array start '[' found in model output.")
-    depth = 0
-    for i in range(start, len(text)):
-        ch = text[i]
-        if ch == "[":
-            depth += 1
-        elif ch == "]":
-            depth -= 1
-            if depth == 0:
-                return text[start:i+1]
-    raise ValueError("Unbalanced JSON array in model output.")
-@spaces.GPU(duration=180)  # ZeroGPU entrypoint
-def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: int, default_len: int):
-    """
-    Chat-format prompt -> deterministic generation -> robust JSON parse.
-    """
-    model, tok = _lazy_model_tok()
-    system = (
-        "You are a film previsualization assistant. "
-        "Return ONLY JSON inside <JSON>...</JSON>. No extra text."
     )
-    user = _storyboard_prompt(user_prompt, n_shots, default_fps, default_len)
-    # Use chat template if available for the model
     if hasattr(tok, "apply_chat_template"):
-        prompt_text = tok.apply_chat_template(
-            [{"role": "system", "content": system},
-             {"role": "user", "content": user}],
             tokenize=False,
             add_generation_prompt=True
         )
-    else:
-        prompt_text = system + "\n\n" + user
     inputs = tok(prompt_text, return_tensors="pt")
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     eos_id = tok.eos_token_id
     gen = model.generate(
         **inputs,
@@ -136,16 +118,40 @@ def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: in
         eos_token_id=eos_id,
         pad_token_id=eos_id,
     )
-    out_text = tok.decode(gen[0], skip_special_tokens=True)
-    # Trim the echoed prompt if present
-    if out_text.startswith(prompt_text):
-        out_text = out_text[len(prompt_text):]
-    json_text = _extract_json_array(out_text)
-    shots_raw = json.loads(json_text)
-    # Normalize fields
     norm = []
     for i, s in enumerate(shots_raw, start=1):
         norm.append({
@@ -162,6 +168,51 @@ def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: in
         })
     return norm
 # =========================
 # Gradio UI
 # =========================

     return proj
 # =========================
+# LLM (ZeroGPU) — Storyboard generator (robust, two-pass)
 # =========================
 from transformers import AutoTokenizer, AutoModelForCausalLM
     _model = AutoModelForCausalLM.from_pretrained(
         STORYBOARD_MODEL,
         device_map="auto",
+        dtype="auto",
         trust_remote_code=True,
     )
     return _model, _tokenizer
+def _prompt_with_tags(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
     return (
         "Return ONLY a JSON array, enclosed between <JSON> and </JSON>.\n"
         f"Create a storyboard of {n_shots} shots for this idea:\n\n"
         "Output:\n<JSON>\n[ { ... }, ... ]\n</JSON>\n"
     )
+def _prompt_minimal(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
+    # Second attempt if tags fail: demand ONLY an array, nothing else.
+    return (
+        "Reply ONLY with a JSON array starting with '[' and ending with ']'. No extra text.\n"
+        f"Storyboard: {n_shots} shots for:\n'''{user_prompt}'''\n"
+        "Each item:\n"
+        "{\n"
+        '  \"id\": <int starting at 1>,\n'
+        '  \"title\": \"Short title\",\n'
+        '  \"description\": \"Visual description\",\n'
+        f"  \"duration\": {default_len},\n"
+        f"  \"fps\": {default_fps},\n"
+        f"  \"video_length\": {default_len},\n"
+        "  \"steps\": 30,\n"
+        "  \"seed\": null,\n"
+        '  \"negative\": \"\"\n'
+        "}\n"
     )
+def _apply_chat(tok, system_msg: str, user_msg: str) -> str:
     if hasattr(tok, "apply_chat_template"):
+        return tok.apply_chat_template(
+            [{"role": "system", "content": system_msg},
+             {"role": "user", "content": user_msg}],
             tokenize=False,
             add_generation_prompt=True
         )
+    return system_msg + "\n\n" + user_msg
+def _generate_text(model, tok, prompt_text: str) -> str:
     inputs = tok(prompt_text, return_tensors="pt")
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     eos_id = tok.eos_token_id
     gen = model.generate(
         **inputs,
         eos_token_id=eos_id,
         pad_token_id=eos_id,
     )
+    text = tok.decode(gen[0], skip_special_tokens=True)
+    # Trim the echoed prompt if the model included it
+    if text.startswith(prompt_text):
+        text = text[len(prompt_text):]
+    # Strip code fences if any
+    text = text.strip()
+    if text.startswith("```"):
+        # remove ```json ... ```
+        text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.IGNORECASE|re.DOTALL).strip()
+    return text
+def _extract_json_array(text: str) -> str:
+    # Prefer <JSON>...</JSON>
+    m = re.search(r"<JSON>(.*?)</JSON>", text, flags=re.DOTALL | re.IGNORECASE)
+    if m:
+        inner = m.group(1).strip()
+        if inner:
+            return inner
+    # Fallback: balanced array
+    start = text.find("[")
+    if start == -1:
+        return ""  # signal failure to caller
+    depth = 0
+    for i in range(start, len(text)):
+        ch = text[i]
+        if ch == "[":
+            depth += 1
+        elif ch == "]":
+            depth -= 1
+            if depth == 0:
+                return text[start:i+1].strip()
+    return ""  # unbalanced
+def _normalize_shots(shots_raw, default_fps: int, default_len: int):
     norm = []
     for i, s in enumerate(shots_raw, start=1):
         norm.append({
         })
     return norm
def _loads_json_array(candidate: str):
    """Parse `candidate` as a JSON array; return the list, or None on failure.

    A second attempt is made with trailing commas before ``]`` / ``}`` removed.
    """
    if not candidate:
        return None
    without_trailing_commas = re.sub(r",\s*([\]\}])", r"\1", candidate)
    for attempt in (candidate, without_trailing_commas):
        try:
            parsed = json.loads(attempt)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return parsed
    return None


@spaces.GPU(duration=180)
def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: int, default_len: int):
    """Generate a storyboard with the LLM and return normalized shots.

    Two-pass generation for robustness:
      1) ask for the array wrapped in <JSON>...</JSON>
      2) strict array-only fallback, used whenever pass 1 does not produce a
         *parseable* JSON array (not merely when nothing could be extracted)

    Args:
        user_prompt: Free-text idea to turn into a storyboard.
        n_shots: Number of shots requested from the model.
        default_fps: FPS value passed to the prompts and to normalization.
        default_len: Length value passed to the prompts and to normalization.

    Returns:
        Whatever ``_normalize_shots`` produces for the parsed shot list.

    Raises:
        ValueError: If neither pass yields a parseable JSON array. The message
            includes a short preview of the last model output.
    """
    model, tok = _lazy_model_tok()
    system = "You are a film previsualization assistant. Output must be valid JSON."

    # ---- PASS 1: with <JSON> tags
    p1 = _apply_chat(tok, system + " Return ONLY JSON inside <JSON> tags.",
                     _prompt_with_tags(user_prompt, n_shots, default_fps, default_len))
    out1 = _generate_text(model, tok, p1)
    shots_raw = _loads_json_array(_extract_json_array(out1))

    # ---- PASS 2: strict array (if needed)
    if shots_raw is None:
        p2 = _apply_chat(tok, system + " Reply ONLY with a JSON array.",
                         _prompt_minimal(user_prompt, n_shots, default_fps, default_len))
        out2 = _generate_text(model, tok, p2)
        shots_raw = _loads_json_array(_extract_json_array(out2))

        # As a last ditch, try the outermost bracket slice.
        if shots_raw is None:
            start = out2.find("[")
            end = out2.rfind("]")
            if start != -1 and end > start:
                shots_raw = _loads_json_array(out2[start:end + 1].strip())

        if shots_raw is None:
            # Show a short preview so you can see what the model returned
            preview = (out2[:400] + "...") if len(out2) > 400 else out2
            raise ValueError(f"LLM did not return parseable JSON.\nPreview:\n{preview}")

    return _normalize_shots(shots_raw, default_fps, default_len)
 # =========================
 # Gradio UI
 # =========================