Spaces:

eeshaAI
/

Zeeb

Sleeping

App Files Community

eeshaAI committed 20 days ago

Commit

6e8dde1

verified ·

1 Parent(s): 7eab64f

Update app.py: full training pipeline with real datasets

Browse files

Files changed (1) hide show

app.py +151 -218

app.py CHANGED Viewed

@@ -1,11 +1,9 @@
 #!/usr/bin/env python3
 """
-Gradio App for EeshaAI/Zeeb — Video Generation
-================================================
-Uses the trained OLMo 2 1B + LoRA model to generate video tokens,
-then decodes them via VQ-VAE into a video file.
-Uses constrained decoding: after <video_start>, only <v_N> tokens are allowed.
 """
 import os
@@ -14,14 +12,16 @@ import threading
 import numpy as np
 import gradio as gr
 # Global model cache
 _model = None
 _tokenizer = None
 _vq_vae = None
 _loading_lock = threading.Lock()
-# Visual token ID range (from tokenizer: <v_0>=100281, <v_1023>=101304)
-VIDEO_START_ID = None  # Will be set after tokenizer loads
 VIDEO_END_ID = None
 V_TOKEN_START_ID = None
 V_TOKEN_END_ID = None
@@ -37,53 +37,49 @@ def load_models():
             return _model, _tokenizer, _vq_vae
         import torch
-        # ── Load VQ-VAE decoder ─────────────────────────────────────────
-        vq_vae_path = "vq_vae_final.pt"
-        if os.path.exists(vq_vae_path):
-            import torch.nn as nn
-            class VQVAEDecoderOnly(nn.Module):
-                """Minimal VQ-VAE decoder for token → pixel decoding."""
-                def __init__(self, codebook_size=1024, codebook_dim=256, latent_dim=256):
-                    super().__init__()
-                    self.codebook = nn.Embedding(codebook_size, codebook_dim)
-                    self.proj = nn.Linear(codebook_dim, latent_dim)
-                    self.decoder = nn.Sequential(
-                        nn.ConvTranspose2d(latent_dim, 128, 4, stride=2, padding=1),
-                        nn.ReLU(),
-                        nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),
-                        nn.ReLU(),
-                        nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),
-                        nn.ReLU(),
-                        nn.Conv2d(32, 3, 3, padding=1),
-                        nn.Sigmoid(),
-                    )
-                def decode_tokens(self, token_ids, grid_h=8, grid_w=8):
-                    tokens = torch.tensor(token_ids[:grid_h * grid_w], dtype=torch.long)
-                    if len(tokens) < grid_h * grid_w:
-                        tokens = torch.cat([tokens, torch.zeros(grid_h * grid_w - len(tokens), dtype=torch.long)])
-                    z = self.codebook(tokens)
-                    z = self.proj(z)
-                    z = z.reshape(1, grid_h, grid_w, -1).permute(0, 3, 1, 2)
-                    frame = self.decoder(z)
-                    return frame
-            _vq_vae = VQVAEDecoderOnly()
-            state = torch.load(vq_vae_path, map_location="cpu", weights_only=False)
-            if isinstance(state, dict):
-                if "state_dict" in state:
-                    sd = state["state_dict"]
-                elif "model_state_dict" in state:
-                    sd = state["model_state_dict"]
-                else:
-                    sd = state
                 filtered = {k: v for k, v in sd.items() if not k.startswith("encoder")}
                 _vq_vae.load_state_dict(filtered, strict=False)
-            print("✅ VQ-VAE decoder loaded")
-        # ── Load trained LLM ────────────────────────────────────────────
         from transformers import AutoModelForCausalLM, AutoTokenizer
         REPO_ID = "eeshaAI/zeeb"
@@ -93,250 +89,187 @@ def load_models():
             _tokenizer.pad_token = _tokenizer.eos_token
         _model = AutoModelForCausalLM.from_pretrained(
-            REPO_ID,
-            trust_remote_code=True,
-            torch_dtype=torch.float32,
         )
         _model.eval()
-        print(f"✅ Model loaded. Vocab size: {len(_tokenizer)}")
-        # Set visual token ID ranges
         VIDEO_START_ID = _tokenizer.convert_tokens_to_ids("<video_start>")
         VIDEO_END_ID = _tokenizer.convert_tokens_to_ids("<video_end>")
         V_TOKEN_START_ID = _tokenizer.convert_tokens_to_ids("<v_0>")
         V_TOKEN_END_ID = _tokenizer.convert_tokens_to_ids("<v_1023>")
-        print(f"   <video_start>={VIDEO_START_ID}, <video_end>={VIDEO_END_ID}")
-        print(f"   <v_0>={V_TOKEN_START_ID}, <v_1023>={V_TOKEN_END_ID}")
         return _model, _tokenizer, _vq_vae
-def generate_video(prompt: str, max_tokens: int = 128):
     """Generate video from a text prompt using constrained decoding + VQ-VAE."""
     import torch
     import torch.nn.functional as F
-    log_lines = []
-    log_lines.append(f"🎬 Generating video for: '{prompt}'\n\n")
     try:
-        log_lines.append("📦 Loading trained model + VQ-VAE...\n")
         model, tokenizer, vq_vae = load_models()
-        log_lines.append("✅ Models loaded.\n\n")
     except Exception as e:
-        import traceback
-        log_lines.append(f"❌ Failed to load models: {e}\n")
-        log_lines.append(traceback.format_exc())
-        return None, "\n".join(log_lines)
-    # ── Format prompt ──────────────────────────────────────────────────
     text = f"Create a video of: {prompt} <video_start>"
-    log_lines.append(f"📝 Prompt: {text}\n\n")
-    # ── Constrained token generation ────────────────────────────────────
-    # After <video_start>, we FORCE the model to only pick from <v_0>...<v_1023>
-    # This is done by masking the logits at each step
-    log_lines.append("🔥 Generating visual tokens (constrained decoding)...\n")
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
-    input_ids = inputs["input_ids"]
-    visual_token_ids = []
-    current_ids = input_ids.clone()
-    # Create a mask that only allows visual token IDs
     vocab_size = len(tokenizer)
     visual_mask = torch.zeros(vocab_size, dtype=torch.bool)
     visual_mask[V_TOKEN_START_ID:V_TOKEN_END_ID + 1] = True
-    # Also allow <video_end> so the model can stop
     visual_mask[VIDEO_END_ID] = True
     with torch.no_grad():
         for step in range(max_tokens):
-            # Forward pass
             outputs = model(input_ids=current_ids)
-            next_token_logits = outputs.logits[:, -1, :]  # [1, vocab_size]
-            # Apply constraint: only allow visual tokens + <video_end>
-            masked_logits = next_token_logits.clone()
-            masked_logits[0, ~visual_mask] = float('-inf')
-            # Sample from the constrained distribution
-            probs = F.softmax(masked_logits / 0.8, dim=-1)  # temperature=0.8
-            # Check if <video_end> has high probability
-            end_prob = probs[0, VIDEO_END_ID].item()
-            # Sample
-            next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
             next_id = next_token.item()
-            # If the model chose <video_end>, stop
             if next_id == VIDEO_END_ID:
-                log_lines.append(f"   Model chose <video_end> at step {step} (end_prob={end_prob:.4f})\n")
                 break
-            # Convert token ID to visual token index
             visual_idx = next_id - V_TOKEN_START_ID
             visual_token_ids.append(visual_idx)
-            # Append to sequence
             current_ids = torch.cat([current_ids, next_token], dim=-1)
-    log_lines.append(f"🎨 Generated {len(visual_token_ids)} visual tokens\n")
     if not visual_token_ids:
-        log_lines.append("⚠️ No visual tokens generated even with constrained decoding.\n")
-        log_lines.append("   Falling back to random token sampling from VQ-VAE codebook.\n")
-        # Fallback: generate random visual tokens
         import random
         visual_token_ids = [random.randint(0, 1023) for _ in range(64)]
-        log_lines.append(f"   Generated {len(visual_token_ids)} random tokens as fallback\n")
-    sample_tokens = visual_token_ids[:20]
-    log_lines.append(f"   Sample tokens: {sample_tokens}\n")
-    unique = len(set(visual_token_ids))
-    log_lines.append(f"   Unique tokens: {unique} / {len(visual_token_ids)}\n\n")
-    # ── Decode to video frames ──────────────────────────────────────────
-    log_lines.append("🎞️ Decoding tokens → video frames via VQ-VAE...\n")
     grid_h, grid_w = 8, 8
-    tokens_per_frame = grid_h * grid_w  # 64
     num_frames = max(1, len(visual_token_ids) // tokens_per_frame)
-    log_lines.append(f"   Grid: {grid_h}×{grid_w} = {tokens_per_frame} tokens/frame\n")
-    log_lines.append(f"   Frames: {num_frames}\n\n")
     frames = []
-    for frame_idx in range(num_frames):
-        start_idx = frame_idx * tokens_per_frame
-        end_idx = start_idx + tokens_per_frame
-        frame_tokens = visual_token_ids[start_idx:end_idx]
-        if vq_vae is not None:
-            try:
-                frame_tensor = vq_vae.decode_tokens(frame_tokens, grid_h, grid_w)
-                frame_np = (frame_tensor[0].permute(1, 2, 0).detach().numpy() * 255).astype(np.uint8)
-                frames.append(frame_np)
-            except Exception as e:
-                log_lines.append(f"   ⚠️ Frame {frame_idx} VQ-VAE error: {e}, using color blocks\n")
-                frames.append(_tokens_to_color_blocks(frame_tokens, grid_h, grid_w))
-        else:
-            frames.append(_tokens_to_color_blocks(frame_tokens, grid_h, grid_w))
     if not frames:
-        log_lines.append("❌ No frames generated!\n")
-        return None, "\n".join(log_lines)
-    # ── Save as video ──────────────────────────────────────────────────
-    log_lines.append(f"💾 Saving {len(frames)} frames as video...\n")
     try:
         from PIL import Image
-        # Upscale 64x64 → 256x256
-        upscaled = []
-        for f in frames:
-            img = Image.fromarray(f)
-            img = img.resize((256, 256), Image.NEAREST)
-            upscaled.append(np.array(img))
-        # Try imageio for MP4
         try:
             import imageio
-            output_path = "/tmp/generated_video.mp4"
-            imageio.mimsave(output_path, upscaled, fps=2)
-            log_lines.append(f"✅ Video saved as MP4: {output_path}\n")
-        except Exception:
-            output_path = "/tmp/generated_video.gif"
-            pil_frames = [Image.fromarray(f) for f in upscaled]
-            pil_frames[0].save(
-                output_path,
-                save_all=True,
-                append_images=pil_frames[1:],
-                duration=500,
-                loop=0,
-            )
-            log_lines.append(f"✅ Video saved as GIF: {output_path}\n")
-        log_lines.append(f"   Resolution: 256×256\n")
-        log_lines.append(f"   Frames: {len(upscaled)}\n")
-        log_lines.append(f"   FPS: 2\n\n")
-        log_lines.append("🎉 Video generation complete!\n")
-        return output_path, "\n".join(log_lines)
     except Exception as e:
-        import traceback
-        log_lines.append(f"❌ Video save error: {e}\n")
-        log_lines.append(traceback.format_exc())
-        return None, "\n".join(log_lines)
-def _tokens_to_color_blocks(token_ids, grid_h=8, grid_w=8):
-    """Convert token IDs to a color-block image as fallback."""
     frame = np.zeros((64, 64, 3), dtype=np.uint8)
-    cell_h, cell_w = 64 // grid_h, 64 // grid_w
     for i, t in enumerate(token_ids[:grid_h * grid_w]):
-        row, col = divmod(i, grid_w)
-        r = (t * 37) % 256
-        g = (t * 73) % 256
-        b = (t * 113) % 256
-        frame[row*cell_h:(row+1)*cell_h, col*cell_w:(col+1)*cell_w] = [r, g, b]
     return frame
-# ── Preload models on boot in background ───────────────────────────────
 def preload():
     try:
         load_models()
-        print("🚀 Models preloaded and ready!")
     except Exception as e:
         print(f"⚠️ Preload error: {e}")
-preload_thread = threading.Thread(target=preload, daemon=True)
-preload_thread.start()
 # ── Gradio UI ──────────────────────────────────────────────────────────
-with gr.Blocks(
-    title="Zeeb — Video-LLM",
-    theme=gr.themes.Soft(),
-) as demo:
-    gr.Markdown(
-        """
         # 🎬 Zeeb — Video-LLM
-        **OLMo 2 1B Instruct** fine-tuned with **LoRA (r=4)** to generate video tokens.
-        Model: [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
-        Uses **constrained decoding** — after `<video_start>`, only visual tokens are allowed.
-        """
-    )
-    prompt_input = gr.Textbox(
-        label="Video Description",
-        placeholder="A cat jumping on a sofa",
-        lines=2,
-        value="A cat jumping on a sofa",
-    )
-    max_tokens_slider = gr.Slider(
-        minimum=32, maximum=256, value=128, step=32,
-        label="Max Visual Tokens to Generate",
-    )
-    generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
-    video_output = gr.Video(label="Generated Video")
-    gen_log = gr.Textbox(
-        label="Generation Log",
-        lines=25,
-        interactive=False,
-        show_copy_button=True,
-    )
-    generate_btn.click(
-        fn=generate_video,
-        inputs=[prompt_input, max_tokens_slider],
-        outputs=[video_output, gen_log],
-    )
 if __name__ == "__main__":

 #!/usr/bin/env python3
 """
+Gradio App for EeshaAI/Zeeb — Video Generation + Training Pipeline
+===================================================================
+Tab 1: Generate Video (uses trained model + VQ-VAE)
+Tab 2: Run Full Pipeline (VQ-VAE training → dataset tokenization → LLM training → push)
 """
 import os
 import numpy as np
 import gradio as gr
+LOG_FILE = "/tmp/pipeline_log.txt"
 # Global model cache
 _model = None
 _tokenizer = None
 _vq_vae = None
 _loading_lock = threading.Lock()
+# Visual token ID range
+VIDEO_START_ID = None
 VIDEO_END_ID = None
 V_TOKEN_START_ID = None
 V_TOKEN_END_ID = None
             return _model, _tokenizer, _vq_vae
         import torch
+        import torch.nn as nn
+        # ── VQ-VAE decoder ─────────────────────────────────────────────
+        class VQVAEDecoderOnly(nn.Module):
             # Decoder half of a VQ-VAE: maps a grid of codebook indices to an RGB image tensor.
             # Pipeline: embedding lookup -> linear projection -> three stride-2 transposed
             # convolutions -> 3-channel convolution with a sigmoid, so outputs lie in [0, 1].
+            def __init__(self, codebook_size=1024, codebook_dim=256, latent_dim=256):
                 # codebook_size: number of embedding rows (valid token indices are 0..codebook_size-1).
                 # codebook_dim:  width of each codebook embedding.
                 # latent_dim:    channel count fed into the first transposed convolution.
+                super().__init__()
+                self.codebook = nn.Embedding(codebook_size, codebook_dim)
+                self.proj = nn.Linear(codebook_dim, latent_dim)
+                self.decoder = nn.Sequential(
                     # Each ConvTranspose2d(kernel=4, stride=2, padding=1) doubles height and width,
                     # so the three of them upsample the token grid by 8x (an 8x8 grid -> 64x64).
+                    nn.ConvTranspose2d(latent_dim, 256, 4, stride=2, padding=1), nn.ReLU(),
+                    nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1), nn.ReLU(),
+                    nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),
                     # The 3x3 convolution with padding=1 keeps the spatial size and emits 3 channels.
+                    nn.Conv2d(64, 3, 3, padding=1), nn.Sigmoid(),
+                )
+            def decode_tokens(self, token_ids, grid_h=8, grid_w=8):
                 # Decode one frame from a flat sequence of codebook indices.
                 # token_ids: sequence of ints; only the first grid_h * grid_w entries are used.
                 # Returns the decoder output for a batch of one: (1, 3, 8 * grid_h, 8 * grid_w).
+                tokens = torch.tensor(token_ids[:grid_h * grid_w], dtype=torch.long)
                 # A short sequence is right-padded with index 0 so the grid is always full.
+                if len(tokens) < grid_h * grid_w:
+                    tokens = torch.cat([tokens, torch.zeros(grid_h * grid_w - len(tokens), dtype=torch.long)])
+                z = self.codebook(tokens)
+                z = self.proj(z)
                 # Tokens are laid out row-major: (grid_h * grid_w, C) -> (1, grid_h, grid_w, C),
                 # then moved to channels-first (1, C, grid_h, grid_w) for the conv stack.
+                z = z.reshape(1, grid_h, grid_w, -1).permute(0, 3, 1, 2)
+                frame = self.decoder(z)
+                return frame
+        # Try loading from local file first, then from model repo
+        vq_vae_loaded = False
+        for vq_path in ["vq_vae_real.pt", "vq_vae_final.pt"]:
+            if os.path.exists(vq_path):
+                _vq_vae = VQVAEDecoderOnly()
+                state = torch.load(vq_path, map_location="cpu", weights_only=False)
+                sd = state.get("state_dict", state.get("model_state_dict", state)) if isinstance(state, dict) else state
                 filtered = {k: v for k, v in sd.items() if not k.startswith("encoder")}
                 _vq_vae.load_state_dict(filtered, strict=False)
+                vq_vae_loaded = True
+                print(f"✅ VQ-VAE loaded from {vq_path}")
+                break
+        if not vq_vae_loaded:
+            _vq_vae = VQVAEDecoderOnly()
+            print("⚠️ Using untrained VQ-VAE (no checkpoint found)")
+        # ── LLM ─────────────────────────────────────────────────────────
         from transformers import AutoModelForCausalLM, AutoTokenizer
         REPO_ID = "eeshaAI/zeeb"
             _tokenizer.pad_token = _tokenizer.eos_token
         _model = AutoModelForCausalLM.from_pretrained(
+            REPO_ID, trust_remote_code=True, torch_dtype=torch.float32
         )
         _model.eval()
         VIDEO_START_ID = _tokenizer.convert_tokens_to_ids("<video_start>")
         VIDEO_END_ID = _tokenizer.convert_tokens_to_ids("<video_end>")
         V_TOKEN_START_ID = _tokenizer.convert_tokens_to_ids("<v_0>")
         V_TOKEN_END_ID = _tokenizer.convert_tokens_to_ids("<v_1023>")
+        print(f"✅ Model loaded. Vocab: {len(_tokenizer)}")
         return _model, _tokenizer, _vq_vae
+def generate_video(prompt: str, max_tokens: int = 64):
     """Generate video from a text prompt using constrained decoding + VQ-VAE."""
     # prompt:     free-text description inserted into the generation template below.
     # max_tokens: upper bound on the number of sampling steps (one token per step).
     # Returns (path, log_text): path is the saved MP4/GIF file, or None when model
     # loading or saving fails; log_text is the accumulated progress log in both cases.
     import torch
     import torch.nn.functional as F
+    log = [f"🎬 Generating video for: '{prompt}'\n\n"]
     try:
+        log.append("📦 Loading models...\n")
         model, tokenizer, vq_vae = load_models()
+        log.append("✅ Models loaded.\n\n")
     except Exception as e:
+        log.append(f"❌ Load error: {e}\n")
+        return None, "".join(log)
+    # Format prompt
     text = f"Create a video of: {prompt} <video_start>"
+    log.append(f"📝 Prompt: {text}\n\n")
+    log.append("🔥 Generating visual tokens (constrained decoding)...\n")
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
+    current_ids = inputs["input_ids"].clone()
+    # Constrained decoding mask
     # True only for the IDs <v_0>..<v_1023> plus <video_end>; every other vocabulary entry
     # is excluded from sampling. The *_ID names are module-level globals.
     # NOTE(review): presumably load_models() has populated them by this point - confirm.
     vocab_size = len(tokenizer)
     visual_mask = torch.zeros(vocab_size, dtype=torch.bool)
     visual_mask[V_TOKEN_START_ID:V_TOKEN_END_ID + 1] = True
     visual_mask[VIDEO_END_ID] = True
+    visual_token_ids = []
     with torch.no_grad():
         for step in range(max_tokens):
             # The whole sequence so far is passed to the model on every step.
             outputs = model(input_ids=current_ids)
+            logits = outputs.logits[:, -1, :]
+            masked = logits.clone()
             # -inf logits become zero probability after the softmax.
+            masked[0, ~visual_mask] = float('-inf')
             # Dividing by 0.8 applies a sampling temperature of 0.8.
+            probs = F.softmax(masked / 0.8, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
             next_id = next_token.item()
             # <video_end> stops generation and is not recorded as a visual token.
             if next_id == VIDEO_END_ID:
                 break
             # Convert the vocabulary ID to a 0-based index relative to <v_0>.
             visual_idx = next_id - V_TOKEN_START_ID
             visual_token_ids.append(visual_idx)
             current_ids = torch.cat([current_ids, next_token], dim=-1)
+    log.append(f"🎨 Generated {len(visual_token_ids)} visual tokens\n")
     # If the first sampled token was <video_end>, fall back to 64 random indices
     # (one full 8x8 frame) so that something is still rendered.
     if not visual_token_ids:
         import random
         visual_token_ids = [random.randint(0, 1023) for _ in range(64)]
+        log.append("⚠️ Fallback: random tokens\n")
+    log.append(f"   Sample: {visual_token_ids[:20]}\n")
+    log.append(f"   Unique: {len(set(visual_token_ids))}\n\n")
+    # Decode frames
+    log.append("🎞️ Decoding tokens → frames...\n")
     grid_h, grid_w = 8, 8
+    tokens_per_frame = grid_h * grid_w
     # Floor division: tokens beyond the last complete frame are dropped. With fewer than
     # one frame's worth of tokens, a single partially filled frame is still produced.
     num_frames = max(1, len(visual_token_ids) // tokens_per_frame)
     frames = []
+    for fi in range(num_frames):
+        ft = visual_token_ids[fi*tokens_per_frame:(fi+1)*tokens_per_frame]
+        try:
+            frame_tensor = vq_vae.decode_tokens(ft, grid_h, grid_w)
             # Drop the batch axis, move channels last, and scale to uint8 pixel values.
+            frame_np = (frame_tensor[0].permute(1, 2, 0).detach().numpy() * 255).astype(np.uint8)
+            frames.append(frame_np)
         # NOTE(review): bare except - any decode failure (and KeyboardInterrupt/SystemExit)
         # silently switches this frame to the colour-block fallback; nothing is logged.
+        except:
+            frames.append(_tokens_to_color(ft, grid_h, grid_w))
     # NOTE(review): appears unreachable - num_frames >= 1 and every loop iteration
     # either appends a frame or raises.
     if not frames:
+        return None, "".join(log)
+    # Save video
     try:
         from PIL import Image
         # Nearest-neighbour upscale of every frame to 256x256.
+        upscaled = [np.array(Image.fromarray(f).resize((256, 256), Image.NEAREST)) for f in frames]
         try:
             import imageio
             # NOTE(review): fixed output path - overlapping requests would write the same file.
+            out = "/tmp/generated_video.mp4"
+            imageio.mimsave(out, upscaled, fps=2)
         # NOTE(review): bare except - a missing imageio and an MP4 write failure both
         # fall through to the GIF path without being logged.
+        except:
+            out = "/tmp/generated_video.gif"
+            pils = [Image.fromarray(f) for f in upscaled]
             # duration=500 (ms per frame, per Pillow's GIF options) matches the 2 fps MP4; loop=0 repeats forever.
+            pils[0].save(out, save_all=True, append_images=pils[1:], duration=500, loop=0)
+        log.append(f"✅ Video saved ({len(upscaled)} frames, 256×256)\n\n🎉 Done!\n")
+        return out, "".join(log)
     except Exception as e:
+        log.append(f"❌ Save error: {e}\n")
+        return None, "".join(log)
+def _tokens_to_color(token_ids, grid_h=8, grid_w=8):
     # Render token indices as a 64x64 RGB image of solid colour cells.
     # generate_video() uses this when VQ-VAE decoding of a frame raises.
     # token_ids: sequence of ints; only the first grid_h * grid_w are drawn.
     # Returns a uint8 array of shape (64, 64, 3); cells without a token stay black.
     frame = np.zeros((64, 64, 3), dtype=np.uint8)
     # Pixel height and width of one grid cell.
+    ch, cw = 64 // grid_h, 64 // grid_w
     for i, t in enumerate(token_ids[:grid_h * grid_w]):
         # Row-major placement: token i goes to row i // grid_w, column i % grid_w.
+        r, c = divmod(i, grid_w)
         # Three different multipliers modulo 256 turn the token value into an RGB triple.
+        frame[r*ch:(r+1)*ch, c*cw:(c+1)*cw] = [(t*37)%256, (t*73)%256, (t*113)%256]
     return frame
+def get_log():
     # Return the full text of LOG_FILE, or the placeholder string below when it
     # cannot be read.
+    try:
+        with open(LOG_FILE, "r") as f:
+            return f.read()
     # NOTE(review): bare except - this also catches KeyboardInterrupt/SystemExit and
     # hides decode errors; presumably only a missing file is expected here - confirm.
+    except:
+        return "No pipeline log yet."
+def start_pipeline():
+    """Start the full training pipeline in background."""
     # Returns a status message; this function does not report pipeline progress itself.
     # The import happens at call time, so train_full_pipeline is not loaded until
     # this function runs.
+    from train_full_pipeline import run_pipeline
     # run_pipeline receives LOG_FILE as its only argument. The thread is a daemon,
     # so it does not keep the process alive on exit.
+    t = threading.Thread(target=run_pipeline, args=(LOG_FILE,), daemon=True)
+    t.start()
     # NOTE(review): nothing here prevents a second call from starting another thread
     # while the first is still running - verify run_pipeline tolerates that.
+    return "🚀 Pipeline started! Click Refresh to see progress."
+# ── Preload generation models ───────────────────────────────────────────
 def preload():
     # Call load_models() ahead of the first generation request. Failures are printed
     # rather than raised.
     try:
         load_models()
+        print("🚀 Generation models preloaded!")
     except Exception as e:
         print(f"⚠️ Preload error: {e}")
 # Started at import time on a daemon thread, so module import (and the UI setup
 # below) does not wait for load_models() to finish.
+threading.Thread(target=preload, daemon=True).start()
 # ── Gradio UI ──────────────────────────────────────────────────────────
+with gr.Blocks(title="Zeeb — Video-LLM", theme=gr.themes.Soft()) as demo:
     # Two-tab UI: one tab runs generate_video(), the other starts the training
     # pipeline and shows its log file.
+    gr.Markdown("""
         # 🎬 Zeeb — Video-LLM
+        **OLMo 2 1B** + **LoRA** + **VQ-VAE** → Text-to-Video generation.
+        [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
+    """)
+    with gr.Tabs():
+        with gr.Tab("🎬 Generate Video"):
+            prompt_input = gr.Textbox(label="Video Description", value="A cat jumping on a sofa", lines=2)
             # Slider range 32..128 in steps of 32; the value is passed as max_tokens.
+            max_tok = gr.Slider(32, 128, value=64, step=32, label="Max Visual Tokens")
+            gen_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
+            video_out = gr.Video(label="Generated Video")
+            gen_log = gr.Textbox(label="Log", lines=15, interactive=False, show_copy_button=True)
             # generate_video returns (path_or_None, log_text), matching the two outputs in order.
+            gen_btn.click(fn=generate_video, inputs=[prompt_input, max_tok], outputs=[video_out, gen_log])
+        with gr.Tab("🔧 Full Training Pipeline"):
+            gr.Markdown("""
+            ### Train from scratch with real data
+            1. **Phase 1**: Train VQ-VAE on 50K COCO images (real photos!)
+            2. **Phase 2**: Tokenize 10K OpenVid-1M clips (or 50K COCO images as fallback)
+            3. **Phase 3**: Fine-tune OLMo 2 1B + LoRA on tokenized data
+            4. **Phase 4**: Push trained model to EeshaAI/zeeb
+            ⚠️ This takes **many hours** on CPU. The Space may need restarts.
+            """)
+            pipe_btn = gr.Button("🚀 Start Full Pipeline", variant="primary", size="lg")
+            ref_btn = gr.Button("🔄 Refresh Log")
             # The initial value is a callable rather than a string.
             # NOTE(review): presumably Gradio calls it on each page load so the box
             # opens with the current log - confirm against the Gradio docs.
+            pipe_log = gr.Textbox(label="Pipeline Log", value=lambda: get_log(), lines=30,
+                                  interactive=False, show_copy_button=True)
             # Starting the pipeline overwrites the log box with the start message;
             # the Refresh button re-reads the log file on demand.
+            pipe_btn.click(fn=start_pipeline, outputs=pipe_log)
+            ref_btn.click(fn=get_log, outputs=pipe_log)
 if __name__ == "__main__":