Spaces:

eeshaAI
/

Zeeb

Sleeping

App Files Files Community

eeshaAI committed 20 days ago

Commit

83a2068

verified ·

1 Parent(s): 5908d9b

Rewrite: generation-only app, preload models, no auto-training

Browse files

Files changed (1) hide show

app.py +153 -209

app.py CHANGED Viewed

@@ -1,128 +1,97 @@
 #!/usr/bin/env python3
 """
-Gradio App for EeshaAI/Zeeb — Training + Video Generation
-==========================================================
-Tab 1: Training (auto-starts on boot)
-Tab 2: Generate Video (loads trained model + VQ-VAE, generates video from prompt)
 """
 import os
-import time
 import re
 import threading
 import numpy as np
 import gradio as gr
-LOG_FILE = "/tmp/training_log.txt"
-GENERATE_LOG = "/tmp/generation_log.txt"
 # Global model cache
 _model = None
 _tokenizer = None
 _vq_vae = None
-def start_training_background():
-    """Start training in a background thread on Space startup."""
-    from train_on_hf_spaces import run_training_to_file
-    run_training_to_file(LOG_FILE)
-def get_log():
-    """Read the current training log."""
-    try:
-        with open(LOG_FILE, "r") as f:
-            return f.read()
-    except FileNotFoundError:
-        return "⏳ Training has not started yet. Please wait..."
-def refresh_log():
-    return get_log()
 def load_models():
     """Load the trained LLM and VQ-VAE decoder (lazy, cached)."""
     global _model, _tokenizer, _vq_vae
-    if _model is not None and _tokenizer is not None:
-        return _model, _tokenizer, _vq_vae
-    import torch
-    # ── Load VQ-VAE decoder ─────────────────────────────────────────────
-    vq_vae_path = "vq_vae_final.pt"
-    if os.path.exists(vq_vae_path):
-        import torch.nn as nn
-        class VQVAEDecoderOnly(nn.Module):
-            """Minimal VQ-VAE decoder for token → pixel decoding."""
-            def __init__(self, codebook_size=1024, codebook_dim=256, latent_dim=256):
-                super().__init__()
-                self.codebook = nn.Embedding(codebook_size, codebook_dim)
-                self.proj = nn.Linear(codebook_dim, latent_dim)
-                # Decoder: upscale from 8x8 spatial to 64x64
-                self.decoder = nn.Sequential(
-                    nn.ConvTranspose2d(latent_dim, 128, 4, stride=2, padding=1),  # 8→16
-                    nn.ReLU(),
-                    nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),  # 16→32
-                    nn.ReLU(),
-                    nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),  # 32→64
-                    nn.ReLU(),
-                    nn.Conv2d(32, 3, 3, padding=1),
-                    nn.Sigmoid(),
-                )
-            def decode_tokens(self, token_ids, grid_h=8, grid_w=8):
-                """Decode a flat list of token IDs into a video frame."""
-                # token_ids: list of ints, length should be grid_h * grid_w
-                tokens = torch.tensor(token_ids[:grid_h * grid_w], dtype=torch.long)
-                if len(tokens) < grid_h * grid_w:
-                    tokens = torch.cat([tokens, torch.zeros(grid_h * grid_w - len(tokens), dtype=torch.long)])
-                # Lookup codebook
-                z = self.codebook(tokens)  # [H*W, D]
-                z = self.proj(z)  # [H*W, latent_dim]
-                z = z.reshape(1, grid_h, grid_w, -1).permute(0, 3, 1, 2)  # [1, C, H, W]
-                # Decode
-                frame = self.decoder(z)  # [1, 3, 64, 64]
-                return frame
-        _vq_vae = VQVAEDecoderOnly()
-        state = torch.load(vq_vae_path, map_location="cpu", weights_only=False)
-        # Try to load relevant weights
-        if isinstance(state, dict):
-            if "codebook" in state or "state_dict" in state:
-                # Full checkpoint
-                sd = state.get("state_dict", state)
                 filtered = {k: v for k, v in sd.items() if not k.startswith("encoder")}
                 _vq_vae.load_state_dict(filtered, strict=False)
-            elif "model_state_dict" in state:
-                _vq_vae.load_state_dict(state["model_state_dict"], strict=False)
-            else:
-                _vq_vae.load_state_dict(state, strict=False)
-        print("✅ VQ-VAE decoder loaded")
-    # ── Load trained LLM ────────────────────────────────────────────────
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-    REPO_ID = "eeshaAI/zeeb"
-    print("📦 Loading trained model from EeshaAI/zeeb...")
-    _tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
-    if _tokenizer.pad_token is None:
-        _tokenizer.pad_token = _tokenizer.eos_token
-    _model = AutoModelForCausalLM.from_pretrained(
-        REPO_ID,
-        trust_remote_code=True,
-        torch_dtype=torch.float32,
-    )
-    _model.eval()
-    print(f"✅ Model loaded. Vocab size: {len(_tokenizer)}")
-    return _model, _tokenizer, _vq_vae
 def generate_video(prompt: str, max_tokens: int = 128):
@@ -133,17 +102,18 @@ def generate_video(prompt: str, max_tokens: int = 128):
     log_lines.append(f"🎬 Generating video for: '{prompt}'\n\n")
     try:
-        # Load models
-        log_lines.append("📦 Loading trained model + VQ-VAE...\n")
         model, tokenizer, vq_vae = load_models()
         log_lines.append("✅ Models loaded.\n\n")
     except Exception as e:
         log_lines.append(f"❌ Failed to load models: {e}\n")
         return None, "\n".join(log_lines)
     # ── Format prompt ──────────────────────────────────────────────────
     text = f"Create a video of: {prompt} <video_start>"
-    log_lines.append(f"📝 Prompt formatted:\n   {text}\n\n")
     # ── Generate tokens ────────────────────────────────────────────────
     log_lines.append("🔥 Generating visual tokens...\n")
@@ -176,7 +146,6 @@ def generate_video(prompt: str, max_tokens: int = 128):
             in_video = False
             break
         if in_video:
-            # Check if it's a <v_N> token
             match = re.match(r"<v_(\d+)>", decoded.strip())
             if match:
                 visual_token_ids.append(int(match.group(1)))
@@ -184,23 +153,22 @@ def generate_video(prompt: str, max_tokens: int = 128):
     log_lines.append(f"🎨 Extracted {len(visual_token_ids)} visual tokens\n")
     if not visual_token_ids:
-        log_lines.append("⚠️ No visual tokens generated! The model may need more training.\n")
-        log_lines.append(f"\nFull output:\n{full_text}\n")
-        # Try alternative: parse from full_text
         all_v_tokens = re.findall(r"<v_(\d+)>", full_text)
         if all_v_tokens:
             visual_token_ids = [int(t) for t in all_v_tokens]
-            log_lines.append(f"\n🔄 Alternative extraction found {len(visual_token_ids)} tokens\n")
         else:
             return None, "\n".join(log_lines)
-    # Show sample of tokens
     sample_tokens = visual_token_ids[:20]
     log_lines.append(f"   Sample tokens: {sample_tokens}\n")
     log_lines.append(f"   Unique tokens: {len(set(visual_token_ids))}\n\n")
     # ── Decode to video frames ──────────────────────────────────────────
-    log_lines.append("🎞️ Decoding tokens → video frames via VQ-VAE...\n")
     grid_h, grid_w = 8, 8
     tokens_per_frame = grid_h * grid_w  # 64
@@ -215,43 +183,22 @@ def generate_video(prompt: str, max_tokens: int = 128):
             start = frame_idx * tokens_per_frame
             end = start + tokens_per_frame
             frame_tokens = visual_token_ids[start:end]
             try:
                 frame_tensor = vq_vae.decode_tokens(frame_tokens, grid_h, grid_w)
-                # Convert to numpy: [1, 3, 64, 64] → [64, 64, 3] uint8
-                frame_np = (frame_tensor[0].permute(1, 2, 0).numpy() * 255).astype(np.uint8)
                 frames.append(frame_np)
             except Exception as e:
                 log_lines.append(f"   ⚠️ Frame {frame_idx} decode error: {e}\n")
-                # Fallback: create frame from token values as colors
-                frame_np = np.zeros((64, 64, 3), dtype=np.uint8)
-                for i, t in enumerate(frame_tokens[:tokens_per_frame]):
-                    row, col = divmod(i, grid_w)
-                    cell_h, cell_w = 64 // grid_h, 64 // grid_w
-                    if row < grid_h and col < grid_w:
-                        # Use token value as a color
-                        r = (t * 37) % 256
-                        g = (t * 73) % 256
-                        b = (t * 113) % 256
-                        frame_np[row*cell_h:(row+1)*cell_h, col*cell_w:(col+1)*cell_w] = [r, g, b]
                 frames.append(frame_np)
     else:
-        # No VQ-VAE: create frames from token values as colored blocks
         log_lines.append("   ⚠️ No VQ-VAE, using token→color mapping\n")
         for frame_idx in range(num_frames):
             start = frame_idx * tokens_per_frame
             end = start + tokens_per_frame
             frame_tokens = visual_token_ids[start:end]
-            frame_np = np.zeros((64, 64, 3), dtype=np.uint8)
-            for i, t in enumerate(frame_tokens[:tokens_per_frame]):
-                row, col = divmod(i, grid_w)
-                cell_h, cell_w = 64 // grid_h, 64 // grid_w
-                if row < grid_h and col < grid_w:
-                    r = (t * 37) % 256
-                    g = (t * 73) % 256
-                    b = (t * 113) % 256
-                    frame_np[row*cell_h:(row+1)*cell_h, col*cell_w:(col+1)*cell_w] = [r, g, b]
-            frames.append(frame_np)
     if not frames:
         log_lines.append("❌ No frames generated!\n")
@@ -261,30 +208,25 @@ def generate_video(prompt: str, max_tokens: int = 128):
     log_lines.append(f"💾 Saving {len(frames)} frames as video...\n")
     try:
-        import imageio
-        output_path = "/tmp/generated_video.mp4"
-        # Upscale frames from 64x64 to 256x256 for better visibility
         from PIL import Image
         upscaled = []
         for f in frames:
             img = Image.fromarray(f)
             img = img.resize((256, 256), Image.NEAREST)
             upscaled.append(np.array(img))
-        # Save as mp4 (2 fps for slow playback since we have few frames)
-        imageio.mimsave(output_path, upscaled, fps=2)
-        log_lines.append(f"✅ Video saved to {output_path}\n")
-        log_lines.append(f"   Resolution: 256×256\n")
-        log_lines.append(f"   Frames: {len(upscaled)}\n")
-        log_lines.append(f"   FPS: 2\n\n")
-        log_lines.append("🎉 Video generation complete!\n")
-        return output_path, "\n".join(log_lines)
-    except ImportError:
-        # Fallback: save as GIF
         try:
-            from PIL import Image
             output_path = "/tmp/generated_video.gif"
-            pil_frames = [Image.fromarray(f).resize((256, 256), Image.NEAREST) for f in frames]
             pil_frames[0].save(
                 output_path,
                 save_all=True,
@@ -292,23 +234,43 @@ def generate_video(prompt: str, max_tokens: int = 128):
                 duration=500,
                 loop=0,
             )
-            log_lines.append(f"✅ GIF saved to {output_path}\n")
-            return output_path, "\n".join(log_lines)
-        except Exception as e:
-            log_lines.append(f"❌ Failed to save video: {e}\n")
-            # Return first frame as image at least
-            img_path = "/tmp/generated_frame.png"
-            Image.fromarray(frames[0]).resize((256, 256), Image.NEAREST).save(img_path)
-            log_lines.append(f"📸 Saved single frame to {img_path}\n")
-            return img_path, "\n".join(log_lines)
     except Exception as e:
         log_lines.append(f"❌ Video save error: {e}\n")
         return None, "\n".join(log_lines)
-# ── Auto-start training on boot ────────────────────────────────────────
-training_thread = threading.Thread(target=start_training_background, daemon=True)
-training_thread.start()
 # ── Gradio UI ──────────────────────────────────────────────────────────
@@ -320,55 +282,37 @@ with gr.Blocks(
     gr.Markdown(
         """
         # 🎬 Zeeb — Video-LLM
-        **OLMo 2 1B Instruct** fine-tuned with **LoRA** to generate video tokens.
-        Model repo: [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
         """
     )
-    with gr.Tabs():
-        # ── Tab 1: Generate Video ───────────────────────────────────────
-        with gr.Tab("🎬 Generate Video"):
-            prompt_input = gr.Textbox(
-                label="Video Description",
-                placeholder="A cat jumping on a sofa",
-                lines=2,
-            )
-            max_tokens_slider = gr.Slider(
-                minimum=32, maximum=256, value=128, step=32,
-                label="Max Visual Tokens",
-            )
-            generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
-            video_output = gr.Video(label="Generated Video")
-            gen_log = gr.Textbox(
-                label="Generation Log",
-                lines=20,
-                interactive=False,
-                show_copy_button=True,
-            )
-            generate_btn.click(
-                fn=generate_video,
-                inputs=[prompt_input, max_tokens_slider],
-                outputs=[video_output, gen_log],
-            )
-        # ── Tab 2: Training ─────────────────────────────────────────────
-        with gr.Tab("🔧 Training"):
-            gr.Markdown(
-                """
-                Training **starts automatically** when this Space boots.
-                Click **Refresh Log** to see progress.
-                """
-            )
-            refresh_btn = gr.Button("🔄 Refresh Log")
-            logbox = gr.Textbox(
-                label="Training Log",
-                value=lambda: get_log(),
-                lines=25,
-                max_lines=200,
-                interactive=False,
-                show_copy_button=True,
-            )
-            refresh_btn.click(fn=refresh_log, outputs=logbox)
 if __name__ == "__main__":

 #!/usr/bin/env python3
 """
+Gradio App for EeshaAI/Zeeb — Video Generation
+================================================
+Uses the trained OLMo 2 1B + LoRA model to generate video tokens,
+then decodes them via VQ-VAE into a video file.
 """
 import os
 import re
 import threading
 import numpy as np
 import gradio as gr
 # Global model cache
 _model = None
 _tokenizer = None
 _vq_vae = None
# Serialises model loading so the boot-time preload thread and the first
# user request never load the (large) models twice in parallel.
_loading_lock = threading.Lock()


def load_models():
    """Load the fine-tuned LLM, its tokenizer and the optional VQ-VAE decoder.

    The result is cached in the module-level ``_model`` / ``_tokenizer`` /
    ``_vq_vae`` globals; later calls return the cached objects. The whole
    load runs under ``_loading_lock``, so a caller arriving while another
    thread is still loading blocks until that load finishes and then gets
    the cached objects.

    Returns:
        A ``(model, tokenizer, vq_vae)`` tuple. ``vq_vae`` is ``None`` when
        ``vq_vae_final.pt`` does not exist in the working directory.

    Raises:
        Exception: whatever ``torch.load``, ``load_state_dict`` (e.g. a
            tensor shape mismatch) or the ``transformers`` loaders raise.
            Nothing is cached in that case, so the next call retries.
    """
    global _model, _tokenizer, _vq_vae
    with _loading_lock:
        if _model is not None and _tokenizer is not None:
            return _model, _tokenizer, _vq_vae

        import torch

        # ── Load VQ-VAE decoder ─────────────────────────────────────────
        vq_vae = None
        vq_vae_path = "vq_vae_final.pt"
        if os.path.exists(vq_vae_path):
            import torch.nn as nn

            class VQVAEDecoderOnly(nn.Module):
                """Minimal VQ-VAE decoder for token → pixel decoding."""

                def __init__(self, codebook_size=1024, codebook_dim=256, latent_dim=256):
                    super().__init__()
                    self.codebook = nn.Embedding(codebook_size, codebook_dim)
                    self.proj = nn.Linear(codebook_dim, latent_dim)
                    # Three stride-2 transposed convolutions, each doubling
                    # the spatial size (an 8x8 token grid becomes 64x64).
                    self.decoder = nn.Sequential(
                        nn.ConvTranspose2d(latent_dim, 128, 4, stride=2, padding=1),
                        nn.ReLU(),
                        nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),
                        nn.ReLU(),
                        nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),
                        nn.ReLU(),
                        nn.Conv2d(32, 3, 3, padding=1),
                        nn.Sigmoid(),
                    )

                @torch.no_grad()  # inference only: never build an autograd graph
                def decode_tokens(self, token_ids, grid_h=8, grid_w=8):
                    """Decode a flat list of codebook indices into one frame.

                    ``token_ids`` is truncated to ``grid_h * grid_w`` entries
                    and right-padded with index 0 when shorter. Returns a
                    ``[1, 3, 8 * grid_h, 8 * grid_w]`` tensor with values in
                    ``[0, 1]`` (sigmoid output).
                    """
                    needed = grid_h * grid_w
                    tokens = torch.tensor(token_ids[:needed], dtype=torch.long)
                    if len(tokens) < needed:
                        padding = torch.zeros(needed - len(tokens), dtype=torch.long)
                        tokens = torch.cat([tokens, padding])
                    z = self.codebook(tokens)  # [H*W, codebook_dim]
                    z = self.proj(z)  # [H*W, latent_dim]
                    z = z.reshape(1, grid_h, grid_w, -1).permute(0, 3, 1, 2)  # [1, C, H, W]
                    return self.decoder(z)

            vq_vae = VQVAEDecoderOnly()
            # SECURITY NOTE: weights_only=False unpickles arbitrary Python
            # objects. Acceptable only because the checkpoint is a file
            # shipped with the app; never point this at an untrusted path.
            state = torch.load(vq_vae_path, map_location="cpu", weights_only=False)
            if isinstance(state, dict):
                # Accept a bare state dict or one wrapped under a common key.
                if "state_dict" in state:
                    sd = state["state_dict"]
                elif "model_state_dict" in state:
                    sd = state["model_state_dict"]
                else:
                    sd = state
                # Encoder weights are not needed for decoding.
                filtered = {k: v for k, v in sd.items() if not k.startswith("encoder")}
                result = vq_vae.load_state_dict(filtered, strict=False)
                # strict=False hides key mismatches; surface them so a decoder
                # running on random weights is not mistaken for a trained one.
                if result.missing_keys:
                    print(
                        f"⚠️ VQ-VAE checkpoint lacks {len(result.missing_keys)} decoder "
                        f"tensor(s), left randomly initialised: {result.missing_keys}"
                    )
                if result.unexpected_keys:
                    print(
                        f"⚠️ VQ-VAE checkpoint has {len(result.unexpected_keys)} "
                        "tensor(s) the decoder does not use"
                    )
            else:
                print(
                    f"⚠️ {vq_vae_path} holds a {type(state).__name__}, not a state dict; "
                    "decoder weights were NOT loaded"
                )
            vq_vae.eval()
            print("✅ VQ-VAE decoder loaded")

        # ── Load trained LLM ────────────────────────────────────────────
        from transformers import AutoModelForCausalLM, AutoTokenizer

        REPO_ID = "eeshaAI/zeeb"
        print("📦 Loading trained model from EeshaAI/zeeb...")
        tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
        if tokenizer.pad_token is None:
            # generate() needs a pad token; reuse EOS when none is defined.
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            REPO_ID,
            trust_remote_code=True,
            torch_dtype=torch.float32,
        )
        model.eval()
        print(f"✅ Model loaded. Vocab size: {len(tokenizer)}")

        # Publish only after every step succeeded, so a failed load never
        # leaves a half-initialised cache behind.
        _model, _tokenizer, _vq_vae = model, tokenizer, vq_vae
        return _model, _tokenizer, _vq_vae
 def generate_video(prompt: str, max_tokens: int = 128):
     log_lines.append(f"🎬 Generating video for: '{prompt}'\n\n")
     try:
+        log_lines.append("📦 Loading trained model + VQ-VAE (first run takes ~3 min)...\n")
         model, tokenizer, vq_vae = load_models()
         log_lines.append("✅ Models loaded.\n\n")
     except Exception as e:
+        import traceback
         log_lines.append(f"❌ Failed to load models: {e}\n")
+        log_lines.append(traceback.format_exc())
         return None, "\n".join(log_lines)
     # ── Format prompt ──────────────────────────────────────────────────
     text = f"Create a video of: {prompt} <video_start>"
+    log_lines.append(f"📝 Prompt: {text}\n\n")
     # ── Generate tokens ────────────────────────────────────────────────
     log_lines.append("🔥 Generating visual tokens...\n")
             in_video = False
             break
         if in_video:
             match = re.match(r"<v_(\d+)>", decoded.strip())
             if match:
                 visual_token_ids.append(int(match.group(1)))
     log_lines.append(f"🎨 Extracted {len(visual_token_ids)} visual tokens\n")
     if not visual_token_ids:
+        log_lines.append("⚠️ No visual tokens in structured format. Trying regex on full output...\n")
         all_v_tokens = re.findall(r"<v_(\d+)>", full_text)
         if all_v_tokens:
             visual_token_ids = [int(t) for t in all_v_tokens]
+            log_lines.append(f"🔄 Regex found {len(visual_token_ids)} tokens\n")
         else:
+            log_lines.append("⚠️ No visual tokens at all. Showing raw output:\n")
+            log_lines.append(f"\n{full_text[:1000]}\n")
             return None, "\n".join(log_lines)
     sample_tokens = visual_token_ids[:20]
     log_lines.append(f"   Sample tokens: {sample_tokens}\n")
     log_lines.append(f"   Unique tokens: {len(set(visual_token_ids))}\n\n")
     # ── Decode to video frames ──────────────────────────────────────────
+    log_lines.append("🎞️ Decoding tokens → video frames...\n")
     grid_h, grid_w = 8, 8
     tokens_per_frame = grid_h * grid_w  # 64
             start = frame_idx * tokens_per_frame
             end = start + tokens_per_frame
             frame_tokens = visual_token_ids[start:end]
             try:
                 frame_tensor = vq_vae.decode_tokens(frame_tokens, grid_h, grid_w)
+                frame_np = (frame_tensor[0].permute(1, 2, 0).detach().numpy() * 255).astype(np.uint8)
                 frames.append(frame_np)
             except Exception as e:
                 log_lines.append(f"   ⚠️ Frame {frame_idx} decode error: {e}\n")
+                # Fallback: color blocks
+                frame_np = _tokens_to_color_blocks(frame_tokens, grid_h, grid_w)
                 frames.append(frame_np)
     else:
         log_lines.append("   ⚠️ No VQ-VAE, using token→color mapping\n")
         for frame_idx in range(num_frames):
             start = frame_idx * tokens_per_frame
             end = start + tokens_per_frame
             frame_tokens = visual_token_ids[start:end]
+            frames.append(_tokens_to_color_blocks(frame_tokens, grid_h, grid_w))
     if not frames:
         log_lines.append("❌ No frames generated!\n")
     log_lines.append(f"💾 Saving {len(frames)} frames as video...\n")
     try:
         from PIL import Image
+        # Upscale 64x64 → 256x256
         upscaled = []
         for f in frames:
             img = Image.fromarray(f)
             img = img.resize((256, 256), Image.NEAREST)
             upscaled.append(np.array(img))
+        # Try imageio for MP4
         try:
+            import imageio
+            output_path = "/tmp/generated_video.mp4"
+            imageio.mimsave(output_path, upscaled, fps=2)
+            log_lines.append(f"✅ Video saved as MP4: {output_path}\n")
+        except Exception:
+            # Fallback to GIF
             output_path = "/tmp/generated_video.gif"
+            pil_frames = [Image.fromarray(f) for f in upscaled]
             pil_frames[0].save(
                 output_path,
                 save_all=True,
                 duration=500,
                 loop=0,
             )
+            log_lines.append(f"✅ Video saved as GIF: {output_path}\n")
+        log_lines.append(f"   Resolution: 256×256\n")
+        log_lines.append(f"   Frames: {len(upscaled)}\n")
+        log_lines.append(f"   FPS: 2\n\n")
+        log_lines.append("🎉 Video generation complete!\n")
+        return output_path, "\n".join(log_lines)
     except Exception as e:
+        import traceback
         log_lines.append(f"❌ Video save error: {e}\n")
+        log_lines.append(traceback.format_exc())
         return None, "\n".join(log_lines)
+def _tokens_to_color_blocks(token_ids, grid_h=8, grid_w=8):
+    """Convert token IDs to a color-block image as fallback."""
+    frame = np.zeros((64, 64, 3), dtype=np.uint8)
+    cell_h, cell_w = 64 // grid_h, 64 // grid_w
+    for i, t in enumerate(token_ids[:grid_h * grid_w]):
+        row, col = divmod(i, grid_w)
+        r = (t * 37) % 256
+        g = (t * 73) % 256
+        b = (t * 113) % 256
+        frame[row*cell_h:(row+1)*cell_h, col*cell_w:(col+1)*cell_w] = [r, g, b]
+    return frame
+# ── Preload models on boot in background ───────────────────────────────
def preload():
    """Warm the model cache by calling ``load_models()`` once.

    Intended as the target of a background thread: any failure is printed
    (message plus traceback) and not raised, so a failed preload leaves the
    app running and the next ``load_models()`` call simply retries.
    """
    try:
        load_models()
    except Exception as e:
        # Top-level boundary of the background thread: report, don't raise.
        import traceback

        print(f"⚠️ Preload error: {e}")
        # The message alone rarely identifies the failing step; print the
        # traceback as the handlers in generate_video do.
        traceback.print_exc()
    else:
        print("🚀 Models preloaded and ready!")


# Daemon thread: loading starts at import time without blocking the UI,
# and the thread never keeps the process alive on shutdown.
preload_thread = threading.Thread(target=preload, daemon=True)
preload_thread.start()
 # ── Gradio UI ──────────────────────────────────────────────────────────
     gr.Markdown(
         """
         # 🎬 Zeeb — Video-LLM
+        **OLMo 2 1B Instruct** fine-tuned with **LoRA (r=4)** to generate video tokens.
+        Model: [EeshaAI/zeeb](https://huggingface.co/EeshaAI/zeeb)
+        Type a description and click Generate!
         """
     )
+    prompt_input = gr.Textbox(
+        label="Video Description",
+        placeholder="A cat jumping on a sofa",
+        lines=2,
+        value="A cat jumping on a sofa",
+    )
+    max_tokens_slider = gr.Slider(
+        minimum=32, maximum=256, value=128, step=32,
+        label="Max Visual Tokens to Generate",
+    )
+    generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
+    video_output = gr.Video(label="Generated Video")
+    gen_log = gr.Textbox(
+        label="Generation Log",
+        lines=20,
+        interactive=False,
+        show_copy_button=True,
+    )
+    generate_btn.click(
+        fn=generate_video,
+        inputs=[prompt_input, max_tokens_slider],
+        outputs=[video_output, gen_log],
+    )
 if __name__ == "__main__":