Spaces:

eeshaAI
/

Zeeb

Sleeping

App Files Files Community

eeshaAI committed 20 days ago

Commit

5ab6307

verified ·

1 Parent(s): 893985c

Update app.py: add video generation tab

Browse files

Files changed (1) hide show

app.py +326 -24

app.py CHANGED Viewed

@@ -1,17 +1,25 @@
 #!/usr/bin/env python3
 """
-Gradio App for EeshaAI/Zeeb Training Space
-==========================================
-Auto-starts LoRA fine-tuning on Space boot.
-The UI shows real-time training progress from the log file.
 """
 import os
 import time
 import threading
 import gradio as gr
 LOG_FILE = "/tmp/training_log.txt"
 def start_training_background():
@@ -30,43 +38,337 @@ def get_log():
 def refresh_log():
-    """Refresh button callback."""
     return get_log()
-# Auto-start training on Space boot
 training_thread = threading.Thread(target=start_training_background, daemon=True)
 training_thread.start()
 with gr.Blocks(
-    title="Zeeb — Video-LLM Trainer",
     theme=gr.themes.Soft(),
 ) as demo:
     gr.Markdown(
         """
-        # 🎬 Zeeb — Video-LLM Trainer
-        Fine-tuning **OLMo 2 1B Instruct** with **LoRA (r=4)** to generate video tokens.
-        Trained model will be pushed to [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb).
-        Training **starts automatically** when this Space boots.
-        Click **Refresh Log** to see progress.
         """
     )
-    refresh_btn = gr.Button("🔄 Refresh Log", variant="primary")
-    logbox = gr.Textbox(
-        label="Training Log",
-        value=lambda: get_log(),
-        lines=30,
-        max_lines=200,
-        interactive=False,
-        show_copy_button=True,
-    )
-    refresh_btn.click(fn=refresh_log, outputs=logbox)
 if __name__ == "__main__":

 #!/usr/bin/env python3
 """
+Gradio App for EeshaAI/Zeeb — Training + Video Generation
+==========================================================
+Tab 1: Generate Video (loads trained model + VQ-VAE, generates video from prompt)
+Tab 2: Training (auto-starts on boot)
 """
 import os
 import time
+import re
 import threading
+import numpy as np
 import gradio as gr
 LOG_FILE = "/tmp/training_log.txt"
+GENERATE_LOG = "/tmp/generation_log.txt"
+# Global model cache
+_model = None
+_tokenizer = None
+_vq_vae = None
 def start_training_background():
 def refresh_log():
     return get_log()
+def load_models():
+    """Load the trained LLM and VQ-VAE decoder (lazy, cached)."""
+    global _model, _tokenizer, _vq_vae
+    if _model is not None and _tokenizer is not None:
+        return _model, _tokenizer, _vq_vae
+    import torch
+    # ── Load VQ-VAE decoder ─────────────────────────────────────────────
+    vq_vae_path = "vq_vae_final.pt"
+    if os.path.exists(vq_vae_path):
+        import torch.nn as nn
+        class VQVAEDecoderOnly(nn.Module):
+            """Minimal VQ-VAE decoder for token → pixel decoding."""
+            def __init__(self, codebook_size=1024, codebook_dim=256, latent_dim=256):
+                super().__init__()
+                self.codebook = nn.Embedding(codebook_size, codebook_dim)
+                self.proj = nn.Linear(codebook_dim, latent_dim)
+                # Decoder: upscale from 8x8 spatial to 64x64
+                self.decoder = nn.Sequential(
+                    nn.ConvTranspose2d(latent_dim, 128, 4, stride=2, padding=1),  # 8→16
+                    nn.ReLU(),
+                    nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),  # 16→32
+                    nn.ReLU(),
+                    nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),  # 32→64
+                    nn.ReLU(),
+                    nn.Conv2d(32, 3, 3, padding=1),
+                    nn.Sigmoid(),
+                )
+            def decode_tokens(self, token_ids, grid_h=8, grid_w=8):
+                """Decode a flat list of token IDs into a video frame."""
+                # token_ids: list of ints, length should be grid_h * grid_w
+                tokens = torch.tensor(token_ids[:grid_h * grid_w], dtype=torch.long)
+                if len(tokens) < grid_h * grid_w:
+                    tokens = torch.cat([tokens, torch.zeros(grid_h * grid_w - len(tokens), dtype=torch.long)])
+                # Lookup codebook
+                z = self.codebook(tokens)  # [H*W, D]
+                z = self.proj(z)  # [H*W, latent_dim]
+                z = z.reshape(1, grid_h, grid_w, -1).permute(0, 3, 1, 2)  # [1, C, H, W]
+                # Decode
+                frame = self.decoder(z)  # [1, 3, 64, 64]
+                return frame
+        _vq_vae = VQVAEDecoderOnly()
+        state = torch.load(vq_vae_path, map_location="cpu", weights_only=False)
+        # Try to load relevant weights
+        if isinstance(state, dict):
+            if "codebook" in state or "state_dict" in state:
+                # Full checkpoint
+                sd = state.get("state_dict", state)
+                filtered = {k: v for k, v in sd.items() if not k.startswith("encoder")}
+                _vq_vae.load_state_dict(filtered, strict=False)
+            elif "model_state_dict" in state:
+                _vq_vae.load_state_dict(state["model_state_dict"], strict=False)
+            else:
+                _vq_vae.load_state_dict(state, strict=False)
+        print("✅ VQ-VAE decoder loaded")
+    # ── Load trained LLM ────────────────────────────────────────────────
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    REPO_ID = "eeshaAI/zeeb"
+    print("📦 Loading trained model from EeshaAI/zeeb...")
+    _tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
+    if _tokenizer.pad_token is None:
+        _tokenizer.pad_token = _tokenizer.eos_token
+    _model = AutoModelForCausalLM.from_pretrained(
+        REPO_ID,
+        trust_remote_code=True,
+        torch_dtype=torch.float32,
+    )
+    _model.eval()
+    print(f"✅ Model loaded. Vocab size: {len(_tokenizer)}")
+    return _model, _tokenizer, _vq_vae
+def generate_video(prompt: str, max_tokens: int = 128):
+    """Generate video from a text prompt using the trained LLM + VQ-VAE."""
+    import torch
+    log_lines = []
+    log_lines.append(f"🎬 Generating video for: '{prompt}'\n\n")
+    try:
+        # Load models
+        log_lines.append("📦 Loading trained model + VQ-VAE...\n")
+        model, tokenizer, vq_vae = load_models()
+        log_lines.append("✅ Models loaded.\n\n")
+    except Exception as e:
+        log_lines.append(f"❌ Failed to load models: {e}\n")
+        return None, "\n".join(log_lines)
+    # ── Format prompt ──────────────────────────────────────────────────
+    text = f"Create a video of: {prompt} <video_start>"
+    log_lines.append(f"📝 Prompt formatted:\n   {text}\n\n")
+    # ── Generate tokens ────────────────────────────────────────────────
+    log_lines.append("🔥 Generating visual tokens...\n")
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=0.8,
+            top_p=0.9,
+            pad_token_id=tokenizer.pad_token_id,
+        )
+    # Decode the full output
+    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)
+    log_lines.append(f"📤 Raw output length: {len(full_text)} chars\n")
+    # Extract visual tokens between <video_start> and <video_end>
+    visual_token_ids = []
+    in_video = False
+    for token_id in output_ids[0].tolist():
+        decoded = tokenizer.decode([token_id])
+        if "<video_start>" in decoded:
+            in_video = True
+            continue
+        if "<video_end>" in decoded:
+            in_video = False
+            break
+        if in_video:
+            # Check if it's a <v_N> token
+            match = re.match(r"<v_(\d+)>", decoded.strip())
+            if match:
+                visual_token_ids.append(int(match.group(1)))
+    log_lines.append(f"🎨 Extracted {len(visual_token_ids)} visual tokens\n")
+    if not visual_token_ids:
+        log_lines.append("⚠️ No visual tokens generated! The model may need more training.\n")
+        log_lines.append(f"\nFull output:\n{full_text}\n")
+        # Try alternative: parse from full_text
+        all_v_tokens = re.findall(r"<v_(\d+)>", full_text)
+        if all_v_tokens:
+            visual_token_ids = [int(t) for t in all_v_tokens]
+            log_lines.append(f"\n🔄 Alternative extraction found {len(visual_token_ids)} tokens\n")
+        else:
+            return None, "\n".join(log_lines)
+    # Show sample of tokens
+    sample_tokens = visual_token_ids[:20]
+    log_lines.append(f"   Sample tokens: {sample_tokens}\n")
+    log_lines.append(f"   Unique tokens: {len(set(visual_token_ids))}\n\n")
+    # ── Decode to video frames ──────────────────────────────────────────
+    log_lines.append("🎞️ Decoding tokens → video frames via VQ-VAE...\n")
+    grid_h, grid_w = 8, 8
+    tokens_per_frame = grid_h * grid_w  # 64
+    num_frames = max(1, len(visual_token_ids) // tokens_per_frame)
+    log_lines.append(f"   Grid: {grid_h}×{grid_w} = {tokens_per_frame} tokens/frame\n")
+    log_lines.append(f"   Frames: {num_frames}\n\n")
+    frames = []
+    if vq_vae is not None:
+        for frame_idx in range(num_frames):
+            start = frame_idx * tokens_per_frame
+            end = start + tokens_per_frame
+            frame_tokens = visual_token_ids[start:end]
+            try:
+                frame_tensor = vq_vae.decode_tokens(frame_tokens, grid_h, grid_w)
+                # Convert to numpy: [1, 3, 64, 64] → [64, 64, 3] uint8
+                frame_np = (frame_tensor[0].permute(1, 2, 0).numpy() * 255).astype(np.uint8)
+                frames.append(frame_np)
+            except Exception as e:
+                log_lines.append(f"   ⚠️ Frame {frame_idx} decode error: {e}\n")
+                # Fallback: create frame from token values as colors
+                frame_np = np.zeros((64, 64, 3), dtype=np.uint8)
+                for i, t in enumerate(frame_tokens[:tokens_per_frame]):
+                    row, col = divmod(i, grid_w)
+                    cell_h, cell_w = 64 // grid_h, 64 // grid_w
+                    if row < grid_h and col < grid_w:
+                        # Use token value as a color
+                        r = (t * 37) % 256
+                        g = (t * 73) % 256
+                        b = (t * 113) % 256
+                        frame_np[row*cell_h:(row+1)*cell_h, col*cell_w:(col+1)*cell_w] = [r, g, b]
+                frames.append(frame_np)
+    else:
+        # No VQ-VAE: create frames from token values as colored blocks
+        log_lines.append("   ⚠️ No VQ-VAE, using token→color mapping\n")
+        for frame_idx in range(num_frames):
+            start = frame_idx * tokens_per_frame
+            end = start + tokens_per_frame
+            frame_tokens = visual_token_ids[start:end]
+            frame_np = np.zeros((64, 64, 3), dtype=np.uint8)
+            for i, t in enumerate(frame_tokens[:tokens_per_frame]):
+                row, col = divmod(i, grid_w)
+                cell_h, cell_w = 64 // grid_h, 64 // grid_w
+                if row < grid_h and col < grid_w:
+                    r = (t * 37) % 256
+                    g = (t * 73) % 256
+                    b = (t * 113) % 256
+                    frame_np[row*cell_h:(row+1)*cell_h, col*cell_w:(col+1)*cell_w] = [r, g, b]
+            frames.append(frame_np)
+    if not frames:
+        log_lines.append("❌ No frames generated!\n")
+        return None, "\n".join(log_lines)
+    # ── Save as video ──────────────────────────────────────────────────
+    log_lines.append(f"💾 Saving {len(frames)} frames as video...\n")
+    try:
+        import imageio
+        output_path = "/tmp/generated_video.mp4"
+        # Upscale frames from 64x64 to 256x256 for better visibility
+        from PIL import Image
+        upscaled = []
+        for f in frames:
+            img = Image.fromarray(f)
+            img = img.resize((256, 256), Image.NEAREST)
+            upscaled.append(np.array(img))
+        # Save as mp4 (2 fps for slow playback since we have few frames)
+        imageio.mimsave(output_path, upscaled, fps=2)
+        log_lines.append(f"✅ Video saved to {output_path}\n")
+        log_lines.append(f"   Resolution: 256×256\n")
+        log_lines.append(f"   Frames: {len(upscaled)}\n")
+        log_lines.append(f"   FPS: 2\n\n")
+        log_lines.append("🎉 Video generation complete!\n")
+        return output_path, "\n".join(log_lines)
+    except ImportError:
+        # Fallback: save as GIF
+        try:
+            from PIL import Image
+            output_path = "/tmp/generated_video.gif"
+            pil_frames = [Image.fromarray(f).resize((256, 256), Image.NEAREST) for f in frames]
+            pil_frames[0].save(
+                output_path,
+                save_all=True,
+                append_images=pil_frames[1:],
+                duration=500,
+                loop=0,
+            )
+            log_lines.append(f"✅ GIF saved to {output_path}\n")
+            return output_path, "\n".join(log_lines)
+        except Exception as e:
+            log_lines.append(f"❌ Failed to save video: {e}\n")
+            # Return first frame as image at least
+            img_path = "/tmp/generated_frame.png"
+            Image.fromarray(frames[0]).resize((256, 256), Image.NEAREST).save(img_path)
+            log_lines.append(f"📸 Saved single frame to {img_path}\n")
+            return img_path, "\n".join(log_lines)
+    except Exception as e:
+        log_lines.append(f"❌ Video save error: {e}\n")
+        return None, "\n".join(log_lines)
+# ── Auto-start training on boot ────────────────────────────────────────
 training_thread = threading.Thread(target=start_training_background, daemon=True)
 training_thread.start()
+# ── Gradio UI ──────────────────────────────────────────────────────────
 with gr.Blocks(
+    title="Zeeb — Video-LLM",
     theme=gr.themes.Soft(),
 ) as demo:
+    # Page header rendered above the tab strip.
     gr.Markdown(
         """
+        # 🎬 Zeeb — Video-LLM
+        **OLMo 2 1B Instruct** fine-tuned with **LoRA** to generate video tokens.
+        Model repo: [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
         """
     )
+    with gr.Tabs():
+        # ── Tab 1: Generate Video ───────────────────────────────────────
+        with gr.Tab("🎬 Generate Video"):
+            # Free-text description; passed to generate_video as `prompt`.
+            prompt_input = gr.Textbox(
+                label="Video Description",
+                placeholder="A cat jumping on a sofa",
+                lines=2,
+            )
+            # Passed to generate_video as `max_tokens`; the initial value 128
+            # matches that function's default.
+            max_tokens_slider = gr.Slider(
+                minimum=32, maximum=256, value=128, step=32,
+                label="Max Visual Tokens",
+            )
+            generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
+            video_output = gr.Video(label="Generated Video")
+            gen_log = gr.Textbox(
+                label="Generation Log",
+                lines=20,
+                interactive=False,
+                show_copy_button=True,
+            )
+            # generate_video returns (file path or None, log text); the pair is
+            # mapped onto the video player and the log box in that order.
+            generate_btn.click(
+                fn=generate_video,
+                inputs=[prompt_input, max_tokens_slider],
+                outputs=[video_output, gen_log],
+            )
+        # ── Tab 2: Training ─────────────────────────────────────────────
+        with gr.Tab("🔧 Training"):
+            gr.Markdown(
+                """
+                Training **starts automatically** when this Space boots.
+                Click **Refresh Log** to see progress.
+                """
+            )
+            refresh_btn = gr.Button("🔄 Refresh Log")
+            # Read-only view of the training log.
+            # NOTE(review): `value` is a callable rather than a string, presumably
+            # so the log is read when the page loads; confirm against the Gradio
+            # Textbox documentation.
+            logbox = gr.Textbox(
+                label="Training Log",
+                value=lambda: get_log(),
+                lines=25,
+                max_lines=200,
+                interactive=False,
+                show_copy_button=True,
+            )
+            # refresh_log() simply returns get_log(); its result replaces the box contents.
+            refresh_btn.click(fn=refresh_log, outputs=logbox)
 if __name__ == "__main__":