Spaces:

MoonMath-ai
/

Prompt-2-Video

Running on Zero

App Files Community

Shalmoni committed on Oct 5

Commit

5904c28

verified ·

1 Parent(s): a035fe0

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -38

app.py CHANGED Viewed

@@ -33,12 +33,12 @@ def load_project_file(file_obj):
     return proj
 # =========================
-# LLM (ZeroGPU) — Storyboard generator (robust, two-pass)
 # =========================
 from transformers import AutoTokenizer, AutoModelForCausalLM
 STORYBOARD_MODEL = os.getenv("STORYBOARD_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
-HF_TASK_MAX_TOKENS = int(os.getenv("HF_TASK_MAX_TOKENS", "900"))
 _tokenizer = None
 _model = None
@@ -54,6 +54,9 @@ def _lazy_model_tok():
         dtype="auto",
         trust_remote_code=True,
     )
     return _model, _tokenizer
 def _prompt_with_tags(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
@@ -61,7 +64,7 @@ def _prompt_with_tags(user_prompt: str, n_shots: int, default_fps: int, default_
         "Return ONLY a JSON array, enclosed between <JSON> and </JSON>.\n"
         f"Create a storyboard of {n_shots} shots for this idea:\n\n"
         f"'''{user_prompt}'''\n\n"
-        "Schema per item:\n"
         "{\n"
         '  \"id\": <int starting at 1>,\n'
         '  \"title\": \"Short title\",\n'
@@ -77,11 +80,10 @@ def _prompt_with_tags(user_prompt: str, n_shots: int, default_fps: int, default_
     )
 def _prompt_minimal(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
-    # Second attempt if tags fail: demand ONLY an array, nothing else.
     return (
         "Reply ONLY with a JSON array starting with '[' and ending with ']'. No extra text.\n"
         f"Storyboard: {n_shots} shots for:\n'''{user_prompt}'''\n"
-        "Each item:\n"
         "{\n"
         '  \"id\": <int starting at 1>,\n'
         '  \"title\": \"Short title\",\n'
@@ -116,30 +118,27 @@ def _generate_text(model, tok, prompt_text: str) -> str:
         temperature=0.0,
         repetition_penalty=1.05,
         eos_token_id=eos_id,
-        pad_token_id=eos_id,
     )
     text = tok.decode(gen[0], skip_special_tokens=True)
-    # Trim the echoed prompt if the model included it
     if text.startswith(prompt_text):
         text = text[len(prompt_text):]
-    # Strip code fences if any
     text = text.strip()
     if text.startswith("```"):
-        # remove ```json ... ```
         text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.IGNORECASE|re.DOTALL).strip()
     return text
 def _extract_json_array(text: str) -> str:
-    # Prefer <JSON>...</JSON>
     m = re.search(r"<JSON>(.*?)</JSON>", text, flags=re.DOTALL | re.IGNORECASE)
     if m:
         inner = m.group(1).strip()
         if inner:
             return inner
-    # Fallback: balanced array
     start = text.find("[")
     if start == -1:
-        return ""  # signal failure to caller
     depth = 0
     for i in range(start, len(text)):
         ch = text[i]
@@ -149,7 +148,7 @@ def _extract_json_array(text: str) -> str:
             depth -= 1
             if depth == 0:
                 return text[start:i+1].strip()
-    return ""  # unbalanced
 def _normalize_shots(shots_raw, default_fps: int, default_len: int):
     norm = []
@@ -171,45 +170,37 @@ def _normalize_shots(shots_raw, default_fps: int, default_len: int):
 @spaces.GPU(duration=180)
 def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: int, default_len: int):
     """
-    Two-pass generation for robustness:
-      1) <JSON>...</JSON>
-      2) strict array-only fallback
     """
     model, tok = _lazy_model_tok()
     system = "You are a film previsualization assistant. Output must be valid JSON."
-    # ---- PASS 1: with <JSON> tags
-    p1 = _apply_chat(tok, system + " Return ONLY JSON inside <JSON> tags.",
                      _prompt_with_tags(user_prompt, n_shots, default_fps, default_len))
     out1 = _generate_text(model, tok, p1)
     json_text = _extract_json_array(out1)
-    # ---- PASS 2: strict array (if needed)
     if not json_text:
-        p2 = _apply_chat(tok, system + " Reply ONLY with a JSON array.",
                          _prompt_minimal(user_prompt, n_shots, default_fps, default_len))
         out2 = _generate_text(model, tok, p2)
         json_text = _extract_json_array(out2)
-        # As a last ditch, try bracket slice only
-        if not json_text:
             start = out2.find("["); end = out2.rfind("]")
             if start != -1 and end != -1 and end > start:
                 json_text = out2[start:end+1].strip()
-        if not json_text:
-            # Show a short preview so you can see what the model returned
-            preview = (out2[:400] + "...") if len(out2) > 400 else out2
-            raise ValueError(f"LLM did not return parseable JSON.\nPreview:\n{preview}")
-    # Parse & normalize
-    if not json_text.strip():
-        # Fallback: model returned nothing. Return a single stub shot.
-        print("⚠️ LLM returned empty output. Using fallback storyboard.")
-        fallback = [{
             "id": 1,
             "title": "Shot 1",
-            "description": f"Fallback shot for: {user_prompt[:50]}",
             "duration": default_len,
             "fps": default_fps,
             "video_length": default_len,
@@ -218,19 +209,16 @@ def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: in
             "negative": "",
             "keyframe_path": None
         }]
-        return fallback
     try:
         shots_raw = json.loads(json_text)
     except Exception:
-        # Attempt a tiny cleanup: remove trailing commas and try again
         json_text_clean = re.sub(r",\s*([\]\}])", r"\1", json_text)
         shots_raw = json.loads(json_text_clean)
     return _normalize_shots(shots_raw, default_fps, default_len)
 # =========================
 # Gradio UI
 # =========================

     return proj
 # =========================
+# LLM (ZeroGPU) — Storyboard generator (robust, two-pass + empty fallback)
 # =========================
 from transformers import AutoTokenizer, AutoModelForCausalLM
 STORYBOARD_MODEL = os.getenv("STORYBOARD_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
+HF_TASK_MAX_TOKENS = int(os.getenv("HF_TASK_MAX_TOKENS", "1200"))  # give a bit more room
 _tokenizer = None
 _model = None
         dtype="auto",
         trust_remote_code=True,
     )
+    # Ensure pad token exists to avoid warnings
+    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
     return _model, _tokenizer
 def _prompt_with_tags(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
         "Return ONLY a JSON array, enclosed between <JSON> and </JSON>.\n"
         f"Create a storyboard of {n_shots} shots for this idea:\n\n"
         f"'''{user_prompt}'''\n\n"
+        "Each item schema:\n"
         "{\n"
         '  \"id\": <int starting at 1>,\n'
         '  \"title\": \"Short title\",\n'
     )
 def _prompt_minimal(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
     return (
         "Reply ONLY with a JSON array starting with '[' and ending with ']'. No extra text.\n"
         f"Storyboard: {n_shots} shots for:\n'''{user_prompt}'''\n"
+        "Item schema:\n"
         "{\n"
         '  \"id\": <int starting at 1>,\n'
         '  \"title\": \"Short title\",\n'
         temperature=0.0,
         repetition_penalty=1.05,
         eos_token_id=eos_id,
+        pad_token_id=tok.pad_token_id if tok.pad_token_id is not None else eos_id,
     )
     text = tok.decode(gen[0], skip_special_tokens=True)
     if text.startswith(prompt_text):
         text = text[len(prompt_text):]
+    # strip code fences if present
     text = text.strip()
     if text.startswith("```"):
         text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.IGNORECASE|re.DOTALL).strip()
     return text
 def _extract_json_array(text: str) -> str:
     m = re.search(r"<JSON>(.*?)</JSON>", text, flags=re.DOTALL | re.IGNORECASE)
     if m:
         inner = m.group(1).strip()
         if inner:
             return inner
+    # Fallback: first balanced array
     start = text.find("[")
     if start == -1:
+        return ""
     depth = 0
     for i in range(start, len(text)):
         ch = text[i]
             depth -= 1
             if depth == 0:
                 return text[start:i+1].strip()
+    return ""
 def _normalize_shots(shots_raw, default_fps: int, default_len: int):
     norm = []
 @spaces.GPU(duration=180)
 def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: int, default_len: int):
     """
+    Two-pass generation with robust parsing and empty-output fallback.
     """
     model, tok = _lazy_model_tok()
     system = "You are a film previsualization assistant. Output must be valid JSON."
+    # PASS 1: with <JSON> tags
+    p1 = _apply_chat(tok, system + " Return ONLY JSON inside <JSON> tags.",
                      _prompt_with_tags(user_prompt, n_shots, default_fps, default_len))
     out1 = _generate_text(model, tok, p1)
+    print(f"[DEBUG] LLM raw out1 (first 240 chars): {out1[:240]}")
     json_text = _extract_json_array(out1)
+    # PASS 2: strict array fallback
     if not json_text:
+        p2 = _apply_chat(tok, system + " Reply ONLY with a JSON array.",
                          _prompt_minimal(user_prompt, n_shots, default_fps, default_len))
         out2 = _generate_text(model, tok, p2)
+        print(f"[DEBUG] LLM raw out2 (first 240 chars): {out2[:240]}")
         json_text = _extract_json_array(out2)
+        if not json_text and "[" in out2 and "]" in out2:
             start = out2.find("["); end = out2.rfind("]")
             if start != -1 and end != -1 and end > start:
                 json_text = out2[start:end+1].strip()
+    # EMPTY FALLBACK → return a single stub so the app does not crash
+    if not json_text or not json_text.strip():
+        print("⚠️ LLM returned empty or unparsable JSON. Using fallback storyboard.")
+        return [{
             "id": 1,
             "title": "Shot 1",
+            "description": f"Fallback shot for: {user_prompt[:80]}",
             "duration": default_len,
             "fps": default_fps,
             "video_length": default_len,
             "negative": "",
             "keyframe_path": None
         }]
+    # Parse & normalize (with tiny trailing-comma cleanup)
     try:
         shots_raw = json.loads(json_text)
     except Exception:
         json_text_clean = re.sub(r",\s*([\]\}])", r"\1", json_text)
         shots_raw = json.loads(json_text_clean)
     return _normalize_shots(shots_raw, default_fps, default_len)
 # =========================
 # Gradio UI
 # =========================