Spaces:

eeshaAI
/

Zeeb

Sleeping

App Files Files Community

eeshaAI committed 21 days ago

Commit

7eab64f

verified ·

1 Parent(s): 83a2068

Add constrained decoding: force visual tokens after <video_start>

Browse files

Files changed (1) hide show

app.py +86 -62

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ Gradio App for EeshaAI/Zeeb — Video Generation
 ================================================
 Uses the trained OLMo 2 1B + LoRA model to generate video tokens,
 then decodes them via VQ-VAE into a video file.
 """
 import os
@@ -18,10 +20,17 @@ _tokenizer = None
 _vq_vae = None
 _loading_lock = threading.Lock()
 def load_models():
     """Load the trained LLM and VQ-VAE decoder (lazy, cached)."""
     global _model, _tokenizer, _vq_vae
     with _loading_lock:
         if _model is not None and _tokenizer is not None:
@@ -91,18 +100,27 @@ def load_models():
         _model.eval()
         print(f"✅ Model loaded. Vocab size: {len(_tokenizer)}")
         return _model, _tokenizer, _vq_vae
 def generate_video(prompt: str, max_tokens: int = 128):
-    """Generate video from a text prompt using the trained LLM + VQ-VAE."""
     import torch
     log_lines = []
     log_lines.append(f"🎬 Generating video for: '{prompt}'\n\n")
     try:
-        log_lines.append("📦 Loading trained model + VQ-VAE (first run takes ~3 min)...\n")
         model, tokenizer, vq_vae = load_models()
         log_lines.append("✅ Models loaded.\n\n")
     except Exception as e:
@@ -115,60 +133,73 @@ def generate_video(prompt: str, max_tokens: int = 128):
     text = f"Create a video of: {prompt} <video_start>"
     log_lines.append(f"📝 Prompt: {text}\n\n")
-    # ── Generate tokens ────────────────────────────────────────────────
-    log_lines.append("🔥 Generating visual tokens...\n")
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
     with torch.no_grad():
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=max_tokens,
-            do_sample=True,
-            temperature=0.8,
-            top_p=0.9,
-            pad_token_id=tokenizer.pad_token_id,
-        )
-    # Decode the full output
-    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)
-    log_lines.append(f"📤 Raw output length: {len(full_text)} chars\n")
-    # Extract visual tokens between <video_start> and <video_end>
-    visual_token_ids = []
-    in_video = False
-    for token_id in output_ids[0].tolist():
-        decoded = tokenizer.decode([token_id])
-        if "<video_start>" in decoded:
-            in_video = True
-            continue
-        if "<video_end>" in decoded:
-            in_video = False
-            break
-        if in_video:
-            match = re.match(r"<v_(\d+)>", decoded.strip())
-            if match:
-                visual_token_ids.append(int(match.group(1)))
-    log_lines.append(f"🎨 Extracted {len(visual_token_ids)} visual tokens\n")
     if not visual_token_ids:
-        log_lines.append("⚠️ No visual tokens in structured format. Trying regex on full output...\n")
-        all_v_tokens = re.findall(r"<v_(\d+)>", full_text)
-        if all_v_tokens:
-            visual_token_ids = [int(t) for t in all_v_tokens]
-            log_lines.append(f"🔄 Regex found {len(visual_token_ids)} tokens\n")
-        else:
-            log_lines.append("⚠️ No visual tokens at all. Showing raw output:\n")
-            log_lines.append(f"\n{full_text[:1000]}\n")
-            return None, "\n".join(log_lines)
     sample_tokens = visual_token_ids[:20]
     log_lines.append(f"   Sample tokens: {sample_tokens}\n")
-    log_lines.append(f"   Unique tokens: {len(set(visual_token_ids))}\n\n")
     # ── Decode to video frames ──────────────────────────────────────────
-    log_lines.append("🎞️ Decoding tokens → video frames...\n")
     grid_h, grid_w = 8, 8
     tokens_per_frame = grid_h * grid_w  # 64
@@ -178,26 +209,20 @@ def generate_video(prompt: str, max_tokens: int = 128):
     frames = []
-    if vq_vae is not None:
-        for frame_idx in range(num_frames):
-            start = frame_idx * tokens_per_frame
-            end = start + tokens_per_frame
-            frame_tokens = visual_token_ids[start:end]
             try:
                 frame_tensor = vq_vae.decode_tokens(frame_tokens, grid_h, grid_w)
                 frame_np = (frame_tensor[0].permute(1, 2, 0).detach().numpy() * 255).astype(np.uint8)
                 frames.append(frame_np)
             except Exception as e:
-                log_lines.append(f"   ⚠️ Frame {frame_idx} decode error: {e}\n")
-                # Fallback: color blocks
-                frame_np = _tokens_to_color_blocks(frame_tokens, grid_h, grid_w)
-                frames.append(frame_np)
-    else:
-        log_lines.append("   ⚠️ No VQ-VAE, using token→color mapping\n")
-        for frame_idx in range(num_frames):
-            start = frame_idx * tokens_per_frame
-            end = start + tokens_per_frame
-            frame_tokens = visual_token_ids[start:end]
             frames.append(_tokens_to_color_blocks(frame_tokens, grid_h, grid_w))
     if not frames:
@@ -224,7 +249,6 @@ def generate_video(prompt: str, max_tokens: int = 128):
             imageio.mimsave(output_path, upscaled, fps=2)
             log_lines.append(f"✅ Video saved as MP4: {output_path}\n")
         except Exception:
-            # Fallback to GIF
             output_path = "/tmp/generated_video.gif"
             pil_frames = [Image.fromarray(f) for f in upscaled]
             pil_frames[0].save(
@@ -285,7 +309,7 @@ with gr.Blocks(
         **OLMo 2 1B Instruct** fine-tuned with **LoRA (r=4)** to generate video tokens.
         Model: [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
-        Type a description and click Generate!
         """
     )
@@ -303,7 +327,7 @@ with gr.Blocks(
     video_output = gr.Video(label="Generated Video")
     gen_log = gr.Textbox(
         label="Generation Log",
-        lines=20,
         interactive=False,
         show_copy_button=True,
     )

 ================================================
 Uses the trained OLMo 2 1B + LoRA model to generate video tokens,
 then decodes them via VQ-VAE into a video file.
+Uses constrained decoding: after <video_start>, only <v_N> tokens are allowed.
 """
 import os
 _vq_vae = None
 _loading_lock = threading.Lock()
+# Visual token ID range (from tokenizer: <v_0>=100281, <v_1023>=101304)
+VIDEO_START_ID = None  # Will be set after tokenizer loads
+VIDEO_END_ID = None
+V_TOKEN_START_ID = None
+V_TOKEN_END_ID = None
 def load_models():
     """Load the trained LLM and VQ-VAE decoder (lazy, cached)."""
     global _model, _tokenizer, _vq_vae
+    global VIDEO_START_ID, VIDEO_END_ID, V_TOKEN_START_ID, V_TOKEN_END_ID
     with _loading_lock:
         if _model is not None and _tokenizer is not None:
         _model.eval()
         print(f"✅ Model loaded. Vocab size: {len(_tokenizer)}")
+        # Set visual token ID ranges
+        VIDEO_START_ID = _tokenizer.convert_tokens_to_ids("<video_start>")
+        VIDEO_END_ID = _tokenizer.convert_tokens_to_ids("<video_end>")
+        V_TOKEN_START_ID = _tokenizer.convert_tokens_to_ids("<v_0>")
+        V_TOKEN_END_ID = _tokenizer.convert_tokens_to_ids("<v_1023>")
+        print(f"   <video_start>={VIDEO_START_ID}, <video_end>={VIDEO_END_ID}")
+        print(f"   <v_0>={V_TOKEN_START_ID}, <v_1023>={V_TOKEN_END_ID}")
         return _model, _tokenizer, _vq_vae
 def generate_video(prompt: str, max_tokens: int = 128):
+    """Generate video from a text prompt using constrained decoding + VQ-VAE."""
     import torch
+    import torch.nn.functional as F
     log_lines = []
     log_lines.append(f"🎬 Generating video for: '{prompt}'\n\n")
     try:
+        log_lines.append("📦 Loading trained model + VQ-VAE...\n")
         model, tokenizer, vq_vae = load_models()
         log_lines.append("✅ Models loaded.\n\n")
     except Exception as e:
     text = f"Create a video of: {prompt} <video_start>"
     log_lines.append(f"📝 Prompt: {text}\n\n")
+    # ── Constrained token generation ────────────────────────────────────
+    # After <video_start>, we FORCE the model to only pick from <v_0>...<v_1023>
+    # This is done by masking the logits at each step
+    log_lines.append("🔥 Generating visual tokens (constrained decoding)...\n")
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
+    input_ids = inputs["input_ids"]
+    visual_token_ids = []
+    current_ids = input_ids.clone()
+    # Create a mask that only allows visual token IDs
+    vocab_size = len(tokenizer)
+    visual_mask = torch.zeros(vocab_size, dtype=torch.bool)
+    visual_mask[V_TOKEN_START_ID:V_TOKEN_END_ID + 1] = True
+    # Also allow <video_end> so the model can stop
+    visual_mask[VIDEO_END_ID] = True
     with torch.no_grad():
+        for step in range(max_tokens):
+            # Forward pass
+            outputs = model(input_ids=current_ids)
+            next_token_logits = outputs.logits[:, -1, :]  # [1, vocab_size]
+            # Apply constraint: only allow visual tokens + <video_end>
+            masked_logits = next_token_logits.clone()
+            masked_logits[0, ~visual_mask] = float('-inf')
+            # Sample from the constrained distribution
+            probs = F.softmax(masked_logits / 0.8, dim=-1)  # temperature=0.8
+            # Record the probability of <video_end>; it is only reported in the log if the model stops
+            end_prob = probs[0, VIDEO_END_ID].item()
+            # Sample
+            next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
+            next_id = next_token.item()
+            # If the model chose <video_end>, stop
+            if next_id == VIDEO_END_ID:
+                log_lines.append(f"   Model chose <video_end> at step {step} (end_prob={end_prob:.4f})\n")
+                break
+            # Convert token ID to visual token index
+            visual_idx = next_id - V_TOKEN_START_ID
+            visual_token_ids.append(visual_idx)
+            # Append to sequence
+            current_ids = torch.cat([current_ids, next_token], dim=-1)
+    log_lines.append(f"🎨 Generated {len(visual_token_ids)} visual tokens\n")
     if not visual_token_ids:
+        log_lines.append("⚠️ No visual tokens generated even with constrained decoding.\n")
+        log_lines.append("   Falling back to random token sampling from VQ-VAE codebook.\n")
+        # Fallback: generate random visual tokens
+        import random
+        visual_token_ids = [random.randint(0, 1023) for _ in range(64)]
+        log_lines.append(f"   Generated {len(visual_token_ids)} random tokens as fallback\n")
     sample_tokens = visual_token_ids[:20]
     log_lines.append(f"   Sample tokens: {sample_tokens}\n")
+    unique = len(set(visual_token_ids))
+    log_lines.append(f"   Unique tokens: {unique} / {len(visual_token_ids)}\n\n")
     # ── Decode to video frames ──────────────────────────────────────────
+    log_lines.append("🎞️ Decoding tokens → video frames via VQ-VAE...\n")
     grid_h, grid_w = 8, 8
     tokens_per_frame = grid_h * grid_w  # 64
     frames = []
+    for frame_idx in range(num_frames):
+        start_idx = frame_idx * tokens_per_frame
+        end_idx = start_idx + tokens_per_frame
+        frame_tokens = visual_token_ids[start_idx:end_idx]
+        if vq_vae is not None:
             try:
                 frame_tensor = vq_vae.decode_tokens(frame_tokens, grid_h, grid_w)
                 frame_np = (frame_tensor[0].permute(1, 2, 0).detach().numpy() * 255).astype(np.uint8)
                 frames.append(frame_np)
             except Exception as e:
+                log_lines.append(f"   ⚠️ Frame {frame_idx} VQ-VAE error: {e}, using color blocks\n")
+                frames.append(_tokens_to_color_blocks(frame_tokens, grid_h, grid_w))
+        else:
             frames.append(_tokens_to_color_blocks(frame_tokens, grid_h, grid_w))
     if not frames:
             imageio.mimsave(output_path, upscaled, fps=2)
             log_lines.append(f"✅ Video saved as MP4: {output_path}\n")
         except Exception:
             output_path = "/tmp/generated_video.gif"
             pil_frames = [Image.fromarray(f) for f in upscaled]
             pil_frames[0].save(
         **OLMo 2 1B Instruct** fine-tuned with **LoRA (r=4)** to generate video tokens.
         Model: [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
+        Uses **constrained decoding** — after `<video_start>`, only visual tokens are allowed.
         """
     )
     video_output = gr.Video(label="Generated Video")
     gen_log = gr.Textbox(
         label="Generation Log",
+        lines=25,
         interactive=False,
         show_copy_button=True,
     )