Spaces:

MoonMath-ai
/

Prompt-2-Video

Running on Zero

App Files Files Community

Shalmoni committed on Oct 5

Commit

58c4d87

verified ·

1 Parent(s): 6b00576

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -65

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
-import os, json, uuid
 from datetime import datetime
 import gradio as gr
-import spaces  # <<< required for ZeroGPU
 # =========================
 # Storage helpers
@@ -32,81 +33,121 @@ def load_project_file(file_obj):
     return proj
 # =========================
-# LLM (ZeroGPU) — Storyboard generator
 # =========================
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 STORYBOARD_MODEL = os.getenv("STORYBOARD_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
-HF_TASK_MAX_TOKENS = int(os.getenv("HF_TASK_MAX_TOKENS", "900"))  # keep tidy for JSON
-_pipe = None  # lazy-loaded global
-def _lazy_pipe():
-    global _pipe
-    if _pipe is not None:
-        return _pipe
-    tok = AutoTokenizer.from_pretrained(STORYBOARD_MODEL, trust_remote_code=True)
-    mdl = AutoModelForCausalLM.from_pretrained(
         STORYBOARD_MODEL,
         device_map="auto",
-        torch_dtype="auto",
         trust_remote_code=True,
     )
-    _pipe = pipeline(
-        "text-generation",
-        model=mdl,
-        tokenizer=tok,
         max_new_tokens=HF_TASK_MAX_TOKENS,
-        do_sample=False,              # deterministic JSON
         temperature=0.0,
         repetition_penalty=1.05,
     )
-    return _pipe
-def _storyboard_prompt(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
-    return f"""
-You are a film previsualization assistant. Return ONLY valid JSON (no explanations).
-Create a storyboard of {n_shots} numbered shots for the following idea:
-\"\"\"{user_prompt}\"\"\"
-Return an array of objects with this exact schema and default values:
-[
-  {{
-    "id": 1,
-    "title": "Short title",
-    "description": "A visual description suitable for keyframe generation",
-    "duration": {default_len},
-    "fps": {default_fps},
-    "video_length": {default_len},
-    "steps": 30,
-    "seed": null,
-    "negative": ""
-  }}
-]
-Rules:
-- IDs must start at 1 and increment by 1.
-- Use simple ASCII only. No trailing commas.
-- Output must be valid JSON parseable by Python's json.loads.
-""".strip()
-@spaces.GPU(duration=180)  # <<< ZeroGPU entrypoint: triggers pooled GPU allocation
-def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: int, default_len: int):
-    pipe = _lazy_pipe()
-    prompt = _storyboard_prompt(user_prompt, n_shots, default_fps, default_len)
-    out = pipe(prompt)[0]["generated_text"]
-    # Extract the JSON array
-    start = out.find("[")
-    end = out.rfind("]")
-    if start == -1 or end == -1 or end <= start:
-        raise ValueError("LLM did not return valid JSON.")
-    text = out[start:end+1]
-    shots = json.loads(text)
-    # Normalize & enforce required fields
     norm = []
-    for i, s in enumerate(shots, start=1):
         norm.append({
             "id": int(s.get("id", i)),
             "title": s.get("title", f"Shot {i}"),
@@ -255,5 +296,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    # SSR is fine; you can set share=True if you want a public link automatically
     demo.launch()

+import os, json, uuid, re
 from datetime import datetime
 import gradio as gr
+import spaces  # ZeroGPU decorator
+import torch
 # =========================
 # Storage helpers
     return proj
 # =========================
+# LLM (ZeroGPU) — Storyboard generator (robust JSON)
 # =========================
+from transformers import AutoTokenizer, AutoModelForCausalLM
 STORYBOARD_MODEL = os.getenv("STORYBOARD_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
+HF_TASK_MAX_TOKENS = int(os.getenv("HF_TASK_MAX_TOKENS", "900"))
+_tokenizer = None
+_model = None
def _lazy_model_tok():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    Both objects are stored in the module-level ``_model`` / ``_tokenizer``
    globals. If either one is still ``None``, the tokenizer and then the
    model are (re)loaded from ``STORYBOARD_MODEL``.

    Returns:
        A ``(model, tokenizer)`` tuple -- note the model comes first.
    """
    global _tokenizer, _model
    needs_load = _tokenizer is None or _model is None
    if needs_load:
        _tokenizer = AutoTokenizer.from_pretrained(
            STORYBOARD_MODEL,
            trust_remote_code=True,
        )
        _model = AutoModelForCausalLM.from_pretrained(
            STORYBOARD_MODEL,
            device_map="auto",
            dtype="auto",  # `dtype` is used instead of the deprecated `torch_dtype`
            trust_remote_code=True,
        )
    return _model, _tokenizer
def _storyboard_prompt(user_prompt: str, n_shots: int, default_fps: int, default_len: int) -> str:
    """Build the user-turn prompt asking for a storyboard as tagged JSON.

    The model is told to wrap its JSON array in ``<JSON>``/``</JSON>`` tags so
    the reply can be located reliably afterwards.

    Args:
        user_prompt: The idea to storyboard; embedded between triple single quotes.
        n_shots: Number of shots requested in the instruction line.
        default_fps: Value shown for ``fps`` in the per-item schema.
        default_len: Value shown for both ``duration`` and ``video_length``.

    Returns:
        The prompt text, ending with a trailing newline.
    """
    schema_lines = [
        "{",
        '  "id": <int starting at 1>,',
        '  "title": "Short title",',
        '  "description": "Visual description for keyframe generation",',
        f'  "duration": {default_len},',
        f'  "fps": {default_fps},',
        f'  "video_length": {default_len},',
        '  "steps": 30,',
        '  "seed": null,',
        '  "negative": ""',
        "}",
    ]
    prompt_lines = [
        "Return ONLY a JSON array, enclosed between <JSON> and </JSON>.",
        f"Create a storyboard of {n_shots} shots for this idea:",
        "",
        f"'''{user_prompt}'''",
        "",
        "Schema per item:",
        *schema_lines,
        "",
        "Output:",
        "<JSON>",
        "[ { ... }, ... ]",
        "</JSON>",
        "",  # yields the trailing newline after </JSON>
    ]
    return "\n".join(prompt_lines)
def _extract_json_array(text: str) -> str:
    """Pull the JSON array text out of raw model output.

    Strategy:
      1. Look at every ``<JSON>...</JSON>`` span (case-insensitive) and return
         the first one whose content parses as a JSON list. Checking all spans
         matters because the prompt's own placeholder
         ``<JSON>[ { ... }, ... ]</JSON>`` can be echoed ahead of the real
         answer, and that placeholder is not valid JSON.
      2. If tagged spans exist but none parse, return the first span's content
         so the caller's ``json.loads`` reports the actual syntax error.
      3. Otherwise fall back to the first balanced top-level ``[...]``,
         ignoring brackets that appear inside double-quoted JSON strings.

    Args:
        text: Decoded model output.

    Returns:
        The substring expected to hold a JSON array (not yet parsed).

    Raises:
        ValueError: If there are no tags and no ``[`` is present, or the
            array opened by the first ``[`` is never closed.
    """
    tagged = [
        match.group(1).strip()
        for match in re.finditer(
            r"<JSON>(.*?)</JSON>", text, flags=re.DOTALL | re.IGNORECASE
        )
    ]
    for candidate in tagged:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError subclasses ValueError
            continue
        if isinstance(parsed, list):
            return candidate
    if tagged:
        return tagged[0]

    start = text.find("[")
    if start == -1:
        raise ValueError("No JSON array start '[' found in model output.")

    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text[start:], start):
        if in_string:
            # Inside a string only an unescaped quote is significant.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    raise ValueError("Unbalanced JSON array in model output.")
+@spaces.GPU(duration=180)  # ZeroGPU entrypoint
+def generate_storyboard_with_llm(user_prompt: str, n_shots: int, default_fps: int, default_len: int):
+    """
+    Chat-format prompt -> deterministic generation -> robust JSON parse.
+    """
+    model, tok = _lazy_model_tok()
+    system = (
+        "You are a film previsualization assistant. "
+        "Return ONLY JSON inside <JSON>...</JSON>. No extra text."
+    )
+    user = _storyboard_prompt(user_prompt, n_shots, default_fps, default_len)
+    # Use chat template if available for the model
+    if hasattr(tok, "apply_chat_template"):
+        prompt_text = tok.apply_chat_template(
+            [{"role": "system", "content": system},
+             {"role": "user", "content": user}],
+            tokenize=False,
+            add_generation_prompt=True
+        )
+    else:
+        prompt_text = system + "\n\n" + user
+    inputs = tok(prompt_text, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    eos_id = tok.eos_token_id
+    gen = model.generate(
+        **inputs,
         max_new_tokens=HF_TASK_MAX_TOKENS,
+        do_sample=False,
         temperature=0.0,
         repetition_penalty=1.05,
+        eos_token_id=eos_id,
+        pad_token_id=eos_id,
     )
+    out_text = tok.decode(gen[0], skip_special_tokens=True)
+    # Trim the echoed prompt if present
+    if out_text.startswith(prompt_text):
+        out_text = out_text[len(prompt_text):]
+    json_text = _extract_json_array(out_text)
+    shots_raw = json.loads(json_text)
+    # Normalize fields
     norm = []
+    for i, s in enumerate(shots_raw, start=1):
         norm.append({
             "id": int(s.get("id", i)),
             "title": s.get("title", f"Shot {i}"),
     )
 if __name__ == "__main__":
     demo.launch()