Deploy with src only (no large eval data)
- README.md +4 -20
- app.py +91 -202
- open_flamingo/__init__.py +2 -0
- open_flamingo/src/__init__.py +0 -0
- open_flamingo/src/factory.py +141 -0
- open_flamingo/src/flamingo.py +338 -0
- open_flamingo/src/flamingo_lm.py +191 -0
- open_flamingo/src/helpers.py +279 -0
- open_flamingo/src/utils.py +48 -0
README.md
CHANGED
@@ -9,26 +9,10 @@ app_file: app.py
 pinned: false
 ---
 
-# RoboFlamingo Demo 🤖
-
-- 📸 Upload your own robot images
-- 💬 Enter natural language instructions
-- 🎯 Get real model predictions
-- 📊 3D trajectory visualization
-
-1. Upload third-person view image
-2. Upload gripper view image
-3. Enter instruction (e.g., "Pick up the red block")
-4. Click "Predict Actions"
-
-## Requirements
-⚠️ **Requires GPU**: Enable T4 GPU in Space settings for real model.
-Without GPU, runs in simulation mode.
-
-## Resources
-- [Paper](https://arxiv.org/abs/2311.01378)
-- [Code](https://github.com/RoboFlamingo/RoboFlamingo)
+# RoboFlamingo Interactive Demo 🤖
+
+Upload images and get real predictions!
+
+⚠️ Enable T4 GPU in Settings for real model.
+
+[Paper](https://arxiv.org/abs/2311.01378) | [Code](https://github.com/RoboFlamingo/RoboFlamingo)
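The README only describes the browser UI. As a rough illustration, the same Space could also be queried programmatically; this is an unverified sketch using `gradio_client` — the Space id and the argument order are placeholders inferred from `app.py`'s `predict(inst, img1, img2)` signature, and recent `gradio_client` versions may require wrapping image paths with `handle_file()`.

```python
# Hypothetical client-side call to the demo Space (not part of this commit).
from gradio_client import Client

client = Client("<user>/roboflamingo-demo")  # placeholder Space id
traj_img, grip_img, table_md, status = client.predict(
    "Pick up the red block",    # instruction
    "third_person_view.png",    # third-person camera image (local file path)
    "gripper_view.png",         # gripper camera image (local file path)
)
print(status)
```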
app.py
CHANGED
@@ -1,8 +1,4 @@
-"""
-RoboFlamingo Interactive Demo - Real Model
-Upload your images and get real predictions!
-"""
-
 import gradio as gr
 import torch
 import numpy as np
@@ -11,31 +7,18 @@ import matplotlib.pyplot as plt
 from io import BytesIO
 import sys
 
-# Add OpenFlamingo to path
 sys.path.insert(0, '/home/user/app/open_flamingo/src')
 
-print("
-print("🚀 INITIALIZING ROBOFLAMINGO")
-print("=" * 70)
-
-# ============================================================================
-# LOAD MODEL
-# ============================================================================
 
 MODEL_LOADED = False
-model = None
-image_processor = None
-tokenizer = None
 
 try:
-    print("📦 Importing
     from factory import create_model_and_transforms
 
-    print("✅ Import successful!")
-    print("🔧 Loading model (2-3 minutes on first run)...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    print(f"
 
     model, image_processor, tokenizer = create_model_and_transforms(
         clip_vision_encoder_path="ViT-L-14",
@@ -46,228 +29,134 @@ try:
         decoder_type='lstm',
     )
 
-    model.to(device)
-    model.eval()
     MODEL_LOADED = True
-
-    print("=" * 70)
-    print("✅ REAL MODEL LOADED!")
-    print("=" * 70)
-
 except Exception as e:
-    print(f"
-    print("⚠️ SIMULATION MODE")
     import traceback
     traceback.print_exc()
 
-# ============================================================================
-
-def create_trajectory_plot(actions):
-    fig = plt.figure(figsize=(10, 8))
     ax = fig.add_subplot(111, projection='3d')
-    z =
-    ax.
-    ax.
-    ax.
-    ax.set_xlabel('X (m)')
-    ax.set_ylabel('Y (m)')
-    ax.set_zlabel('Z (m)')
-    ax.set_title('Predicted Trajectory')
-    ax.legend()
-    ax.grid(True)
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
     return Image.open(buf)
 
-def
-    fig, ax = plt.subplots(figsize=(12,
-        ax.text(i, 0.5, label, ha='center', va='center', fontweight='bold')
-    ax.set_xlabel('Timestep')
-    ax.set_title('Gripper Commands')
-    ax.set_ylim(0, 1.2)
-    ax.grid(True, alpha=0.3)
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
-    buf.seek(0)
-    plt.close()
     return Image.open(buf)
 
-def
-        table += f"| {a['timestep']} | {a['delta_x']:.3f} | {a['delta_y']:.3f} | {a['delta_z']:.3f} | "
-        table += f"({a['qw']:.2f},{a['qx']:.2f},{a['qy']:.2f},{a['qz']:.2f}) |\n"
-    return table
-
-# ============================================================================
-# PREDICTION
-# ============================================================================
-
-def simulate(instruction):
-    """Fallback simulation."""
-    seed = sum(ord(c) for c in instruction) % 100
-    np.random.seed(seed)
-
-    actions = []
     for t in range(12):
-        p = t
-
-    if not instruction or not instruction.strip():
-        return None, None, "", "❌ Enter instruction!"
-    if third_img is None:
-        return None, None, "", "❌ Upload third-person view!"
-    if grip_img is None:
-        return None, None, "", "❌ Upload gripper view!"
 
     try:
-        if isinstance(
-        if isinstance(
 
         if not MODEL_LOADED:
-            status = f"⚠️ SIMULATION\n\n{instruction}\n\nEnable GPU for real model."
         else:
-            print(f"🚀 Real inference: {instruction}")
             with torch.no_grad():
-                t1 = image_processor(
-                t2 = image_processor(
-                    truncation=True, max_length=512).to(device)
-                outputs = model(vision_x=vision_x, lang_x=tokens['input_ids'],
-                    attention_mask=tokens.get('attention_mask'))
 
-            if isinstance(
-            elif isinstance(
             else:
 
-            if
-                for t,
-                    if len(
-                    })
-
-                if grip is not None:
-                    grip_np = grip[0].cpu().numpy()
-                    gripper = [int(g>0.5) if np.isscalar(g) else int(g[0]>0.5) for g in grip_np]
-                else:
-                    gripper = [0]*len(actions)
-
-                status = f"✅ REAL MODEL\n\n{instruction}\n\n{device}\n{len(actions)} timesteps"
-                print(f"✅ Success! {len(actions)} timesteps")
             else:
-                status = f"⚠️ Unexpected output\n{
 
-        traj =
-        table =
 
    except Exception as e:
-        print(f"
        import traceback
        traceback.print_exc()
-        traj = create_trajectory_plot(actions)
-        grip_plot = create_gripper_timeline(gripper)
-        table = format_table(actions)
-        return traj, grip_plot, table, f"❌ Error: {str(e)}"
-
-# ============================================================================
-# UI
-# ============================================================================
 
-mode = "🟢 REAL
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown(f""
-    # 🤖 RoboFlamingo Demo - {mode}
-    ### Vision-Language Foundation Models as Effective Robot Imitators
-    {'✅ Real model loaded!' if MODEL_LOADED else '⚠️ Simulation mode - Enable GPU in settings'}
-    **Upload your images and enter instructions:**
-    """)
 
     with gr.Row():
         with gr.Column():
-            gr.
-            instruction = gr.Textbox(label="Instruction",
-                placeholder="Pick up the red block", lines=3)
             with gr.Row():
-            btn = gr.Button("🚀 Predict
         with gr.Column():
-            gr.Markdown("### 📊 Predictions")
             traj = gr.Image(label="Trajectory", type="pil")
             grip = gr.Image(label="Gripper", type="pil")
 
-    btn.click(predict, [instruction, third, gripper], [traj, grip, table, status])
-
-    gr.Markdown(f"""
-    ---
-    **Status:** {mode} | **Device:** {'GPU' if torch.cuda.is_available() else 'CPU'}
-
-    [Paper](https://arxiv.org/abs/2311.01378) | [Code](https://github.com/RoboFlamingo/RoboFlamingo)
-    {
-    """)
 
 demo.launch()
+"""RoboFlamingo Interactive Demo"""
 import gradio as gr
 import torch
 import numpy as np
 from io import BytesIO
 import sys
 
 sys.path.insert(0, '/home/user/app/open_flamingo/src')
 
+print("🚀 Initializing RoboFlamingo")
 
 MODEL_LOADED = False
 
 try:
+    print("📦 Importing...")
     from factory import create_model_and_transforms
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Device: {device}")
 
     model, image_processor, tokenizer = create_model_and_transforms(
         clip_vision_encoder_path="ViT-L-14",
         decoder_type='lstm',
     )
 
+    model.to(device).eval()
     MODEL_LOADED = True
+    print("✅ Model loaded!")
 except Exception as e:
+    print(f"⚠️ Model failed: {e}")
     import traceback
     traceback.print_exc()
 
+def plot_traj(acts):
+    fig = plt.figure(figsize=(10,8))
     ax = fig.add_subplot(111, projection='3d')
+    x = np.cumsum([a['delta_x'] for a in acts])
+    y = np.cumsum([a['delta_y'] for a in acts])
+    z = np.cumsum([a['delta_z'] for a in acts])
+    ax.plot(x, y, z, 'b-', lw=2, marker='o', ms=6)
+    ax.scatter(x[0], y[0], z[0], c='green', s=100, label='Start')
+    ax.scatter(x[-1], y[-1], z[-1], c='red', s=100, label='End')
+    ax.set_xlabel('X'); ax.set_ylabel('Y'); ax.set_zlabel('Z')
+    ax.legend(); ax.grid()
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
+    buf.seek(0); plt.close()
     return Image.open(buf)
 
+def plot_grip(grip):
+    fig, ax = plt.subplots(figsize=(12,3))
+    cols = ['green' if g==0 else 'red' for g in grip]
+    ax.bar(range(len(grip)), [1]*len(grip), color=cols, alpha=0.7, ec='black')
+    for i, g in enumerate(grip):
+        ax.text(i, 0.5, 'OPEN' if g==0 else 'CLOSE', ha='center', va='center', weight='bold')
+    ax.set_xlabel('Timestep'); ax.set_ylim(0,1.2); ax.grid(alpha=0.3)
     buf = BytesIO()
     plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
+    buf.seek(0); plt.close()
     return Image.open(buf)
 
+def simulate(inst):
+    np.random.seed(sum(ord(c) for c in inst) % 100)
+    acts = []
     for t in range(12):
+        p = t/12
+        acts.append({'timestep': t, 'delta_x': (0.05+np.random.randn()*0.01)*p,
+                     'delta_y': (0.02+np.random.randn()*0.01)*p,
+                     'delta_z': (-0.03+np.random.randn()*0.01)*(1-p),
+                     'qw': 0.99, 'qx': 0.01, 'qy': 0.01, 'qz': 0.01})
+    return acts, [0]*6+[1]*6
+
+def predict(inst, img1, img2):
+    if not inst or not inst.strip():
+        return None, None, "", "❌ Enter instruction"
+    if img1 is None or img2 is None:
+        return None, None, "", "❌ Upload both images"
 
     try:
+        if isinstance(img1, np.ndarray):
+            img1 = Image.fromarray(img1)
+        if isinstance(img2, np.ndarray):
+            img2 = Image.fromarray(img2)
 
         if not MODEL_LOADED:
+            acts, grip = simulate(inst)
+            status = f"⚠️ SIMULATION\n{inst}\nEnable GPU for real model"
         else:
             with torch.no_grad():
+                t1 = image_processor(img1).unsqueeze(0).to(device)
+                t2 = image_processor(img2).unsqueeze(0).to(device)
+                vis = torch.stack([t1, t2], dim=1)
+                tok = tokenizer(inst, return_tensors="pt", padding=True, truncation=True).to(device)
+                out = model(vision_x=vis, lang_x=tok['input_ids'], attention_mask=tok.get('attention_mask'))
 
+            if isinstance(out, dict):
+                a = out.get('actions', out.get('action'))
+                g = out.get('gripper')
+            elif isinstance(out, tuple):
+                a = out[0]
+                g = out[1] if len(out)>1 else None
             else:
+                a = out
+                g = None
 
+            if a is not None:
+                anp = a[0].cpu().numpy()
+                acts = []
+                for t, ac in enumerate(anp):
+                    if len(ac)<7: ac = np.pad(ac, (0,7-len(ac)))
+                    acts.append({'timestep': t, 'delta_x': float(ac[0]), 'delta_y': float(ac[1]),
+                                 'delta_z': float(ac[2]), 'qw': float(ac[3]), 'qx': float(ac[4]),
+                                 'qy': float(ac[5]), 'qz': float(ac[6])})
+                grip = [int(x>0.5) if np.isscalar(x) else int(x[0]>0.5) for x in (g[0].cpu().numpy() if g is not None else [0]*len(acts))]
+                status = f"✅ REAL MODEL\n{inst}\n{device}"
             else:
+                acts, grip = simulate(inst)
+                status = f"⚠️ Unexpected output\n{inst}"
 
+        traj = plot_traj(acts)
+        gp = plot_grip(grip)
+        table = "| T | Δx | Δy | Δz |\n|--|--|--|--|\n"
+        for a in acts:
+            table += f"| {a['timestep']} | {a['delta_x']:.3f} | {a['delta_y']:.3f} | {a['delta_z']:.3f} |\n"
 
+        return traj, gp, table, status
     except Exception as e:
+        print(f"Error: {e}")
         import traceback
         traceback.print_exc()
+        acts, grip = simulate(inst)
+        return plot_traj(acts), plot_grip(grip), "", f"❌ {str(e)}"
 
+mode = "🟢 REAL" if MODEL_LOADED else "🟡 SIM"
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"# 🤖 RoboFlamingo - {mode}\n{'Real model loaded!' if MODEL_LOADED else 'Enable GPU for real model'}")
 
     with gr.Row():
         with gr.Column():
+            inst = gr.Textbox(label="Instruction", placeholder="Pick up the red block", lines=3)
             with gr.Row():
+                img1 = gr.Image(label="Third-Person", type="pil", height=250)
+                img2 = gr.Image(label="Gripper", type="pil", height=250)
+            btn = gr.Button("🚀 Predict", variant="primary", size="lg")
+            st = gr.Textbox(label="Status", lines=4, interactive=False)
         with gr.Column():
             traj = gr.Image(label="Trajectory", type="pil")
             grip = gr.Image(label="Gripper", type="pil")
 
+            tab = gr.Markdown()
+    btn.click(predict, [inst, img1, img2], [traj, grip, tab, st])
 
+    gr.Markdown(f"**Status:** {mode} | [Paper](https://arxiv.org/abs/2311.01378)")
 
 demo.launch()
open_flamingo/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .src.flamingo import Flamingo
+from .src.factory import create_model_and_transforms
open_flamingo/src/__init__.py
ADDED
File without changes
open_flamingo/src/factory.py
ADDED
@@ -0,0 +1,141 @@
from typing import Optional

from transformers import AutoModelForCausalLM, AutoTokenizer
import open_clip

from .flamingo import Flamingo
from .flamingo_lm import FlamingoLMMixin
from .utils import extend_instance


def create_model_and_transforms(
    clip_vision_encoder_path: str,
    clip_vision_encoder_pretrained: str,
    lang_encoder_path: str,
    tokenizer_path: str,
    cross_attn_every_n_layers: int = 1,
    use_local_files: bool = False,
    decoder_layers_attr_name: str = None,
    freeze_lm_embeddings: bool = False,
    cache_dir: Optional[str] = None,
    **flamingo_kwargs,
):
    """
    Initialize a Flamingo model from a pretrained vision encoder and language encoder.
    Appends special tokens to the tokenizer and freezes backbones.

    Args:
        clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32")
        clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k")
        lang_encoder_path (str): path to pretrained language encoder
        tokenizer_path (str): path to pretrained tokenizer
        cross_attn_every_n_layers (int, optional): determines how often to add a cross-attention layer. Defaults to 1.
        use_local_files (bool, optional): whether to use local files. Defaults to False.
        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
        freeze_lm_embeddings (bool, optional): whether to freeze LM input embeddings when configuring Perceiver.
        cache_dir (str, optional): path to cache directory for downloading OpenClip/HF weights.
    Returns:
        Flamingo: Flamingo model from pretrained vision and language encoders
        Image processor: Pipeline to preprocess input images
        Tokenizer: A tokenizer for the language model
    """
    vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
        clip_vision_encoder_path,
        pretrained=clip_vision_encoder_pretrained,
        cache_dir=cache_dir,
    )
    # set the vision encoder to output the visual features
    vision_encoder.visual.output_tokens = True

    text_tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        local_files_only=use_local_files,
        trust_remote_code=True,
        cache_dir=cache_dir,
    )
    # add Flamingo special tokens to the tokenizer
    text_tokenizer.add_special_tokens(
        {"additional_special_tokens": ["<|endofchunk|>", "<image>", "<action>"]}
    )
    if text_tokenizer.pad_token is None:
        # Issue: GPT models don't have a pad token, which we use to
        # modify labels for the loss.
        text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})

    lang_encoder = AutoModelForCausalLM.from_pretrained(
        lang_encoder_path,
        local_files_only=use_local_files,
        trust_remote_code=True,
        cache_dir=cache_dir,
    )

    # hacks for MPT-1B, which doesn't have a get_input_embeddings method
    if "mpt-1b-redpajama-200b" in lang_encoder_path:

        class EmbeddingFnMixin:
            def get_input_embeddings(self):
                return self.transformer.wte

            def set_input_embeddings(self, new_embeddings):
                self.transformer.wte = new_embeddings

        extend_instance(lang_encoder, EmbeddingFnMixin)

    # convert LM to FlamingoLM
    extend_instance(lang_encoder, FlamingoLMMixin)

    if decoder_layers_attr_name is None:
        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
    lang_encoder.resize_token_embeddings(len(text_tokenizer))

    model = Flamingo(
        vision_encoder,
        lang_encoder,
        text_tokenizer.encode("<|endofchunk|>")[-1],
        text_tokenizer.encode("<image>")[-1],
        vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"][
            "width"
        ],
        cross_attn_every_n_layers=cross_attn_every_n_layers,
        **flamingo_kwargs,
    )

    # Freeze all parameters
    model.requires_grad_(False)
    assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0

    # Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
    model.perceiver.requires_grad_(True)
    model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
    if not freeze_lm_embeddings:
        model.lang_encoder.get_input_embeddings().requires_grad_(True)
        # TODO: investigate also training the output embeddings when untied

    print(
        f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
    )

    return model, image_processor, text_tokenizer


def _infer_decoder_layers_attr_name(model):
    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
        if k.lower() in model.__class__.__name__.lower():
            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]

    raise ValueError(
        f"We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
    )


__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
    "opt": "model.decoder.layers",
    "gptj": "transformer.h",
    "gpt-j": "transformer.h",
    "pythia": "gpt_neox.layers",
    "llama": "model.layers",
    "gptneoxforcausallm": "gpt_neox.layers",
    "mpt": "transformer.blocks",
    "mosaicgpt": "transformer.blocks",
}
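A minimal usage sketch of the factory added above, based on its docstring and on how app.py calls it. The vision checkpoint tag and the MPT-1B language model path are assumptions for illustration (the MPT path is only mentioned in the factory's compatibility hack), not necessarily what the Space downloads.

```python
# Sketch: build a Flamingo model plus its image processor and tokenizer.
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",                      # assumed open_clip tag
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",      # assumed LM checkpoint
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,
)
model.eval()  # backbones are frozen by the factory; only perceiver/cross-attn layers stay trainable
```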
open_flamingo/src/flamingo.py
ADDED
@@ -0,0 +1,338 @@
| 1 |
+
import torch
|
| 2 |
+
from einops import rearrange
|
| 3 |
+
from torch import nn
|
| 4 |
+
from .helpers import PerceiverResampler
|
| 5 |
+
from torch.distributed.fsdp.wrap import (
|
| 6 |
+
enable_wrap,
|
| 7 |
+
wrap,
|
| 8 |
+
)
|
| 9 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 10 |
+
from torch.distributed.fsdp import (
|
| 11 |
+
FullyShardedDataParallel as FSDP,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
from .utils import apply_with_stopping_condition
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Flamingo(nn.Module):
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
vision_encoder: nn.Module,
|
| 21 |
+
lang_encoder: nn.Module,
|
| 22 |
+
eoc_token_id: int,
|
| 23 |
+
media_token_id: int,
|
| 24 |
+
vis_dim: int,
|
| 25 |
+
cross_attn_every_n_layers: int = 1,
|
| 26 |
+
gradient_checkpointing: bool = False,
|
| 27 |
+
):
|
| 28 |
+
"""
|
| 29 |
+
Args:
|
| 30 |
+
vision_encoder (nn.Module): HF CLIPModel
|
| 31 |
+
lang_encoder (nn.Module): HF causal language model
|
| 32 |
+
eoc_token_id (int): Token id for <|endofchunk|>
|
| 33 |
+
media_token_id (int): Token id for <image>
|
| 34 |
+
vis_dim (int): Dimension of the visual features.
|
| 35 |
+
Visual features are projected to match this shape along the last dimension.
|
| 36 |
+
cross_attn_every_n_layers (int, optional): How often to apply cross attention after transformer layer. Defaults to 1.
|
| 37 |
+
"""
|
| 38 |
+
super().__init__()
|
| 39 |
+
self.eoc_token_id = eoc_token_id
|
| 40 |
+
self.media_token_id = media_token_id
|
| 41 |
+
self.vis_dim = vis_dim
|
| 42 |
+
if hasattr(lang_encoder.config, "d_model"):
|
| 43 |
+
self.lang_dim = lang_encoder.config.d_model # mpt uses d_model
|
| 44 |
+
else:
|
| 45 |
+
self.lang_dim = lang_encoder.config.hidden_size
|
| 46 |
+
|
| 47 |
+
self.vision_encoder = vision_encoder.visual
|
| 48 |
+
self.perceiver = PerceiverResampler(dim=self.vis_dim)
|
| 49 |
+
self.lang_encoder = lang_encoder
|
| 50 |
+
self.lang_encoder.init_flamingo(
|
| 51 |
+
media_token_id=media_token_id,
|
| 52 |
+
lang_hidden_size=self.lang_dim,
|
| 53 |
+
vis_hidden_size=self.vis_dim,
|
| 54 |
+
cross_attn_every_n_layers=cross_attn_every_n_layers,
|
| 55 |
+
gradient_checkpointing=gradient_checkpointing,
|
| 56 |
+
)
|
| 57 |
+
self._use_gradient_checkpointing = gradient_checkpointing
|
| 58 |
+
self.perceiver._use_gradient_checkpointing = gradient_checkpointing
|
| 59 |
+
|
| 60 |
+
def forward(
|
| 61 |
+
self,
|
| 62 |
+
vision_x: torch.Tensor,
|
| 63 |
+
lang_x: torch.Tensor,
|
| 64 |
+
attention_mask: torch.Tensor = None,
|
| 65 |
+
labels: torch.Tensor = None,
|
| 66 |
+
clear_conditioned_layers: bool = True,
|
| 67 |
+
past_key_values=None,
|
| 68 |
+
use_cache: bool = False,
|
| 69 |
+
):
|
| 70 |
+
"""
|
| 71 |
+
Forward pass of Flamingo.
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
vision_x (torch.Tensor): Vision input
|
| 75 |
+
shape (B, T_img, F, C, H, W) with F=1
|
| 76 |
+
lang_x (torch.Tensor): Language input ids
|
| 77 |
+
shape (B, T_txt)
|
| 78 |
+
attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
|
| 79 |
+
labels (torch.Tensor, optional): Labels. Defaults to None.
|
| 80 |
+
clear_conditioned_layers: if True, clear the conditioned layers
|
| 81 |
+
once the foward pass is completed. Set this to false if the
|
| 82 |
+
same set of images will be reused in another subsequent
|
| 83 |
+
forward pass.
|
| 84 |
+
past_key_values: pre-computed values to pass to language model.
|
| 85 |
+
See past_key_values documentation in Hugging Face
|
| 86 |
+
CausalLM models.
|
| 87 |
+
use_cache: whether to use cached key values. See use_cache
|
| 88 |
+
documentation in Hugging Face CausalLM models.
|
| 89 |
+
"""
|
| 90 |
+
assert (
|
| 91 |
+
self.lang_encoder.initialized_flamingo
|
| 92 |
+
), "Flamingo layers are not initialized. Please call `init_flamingo` first."
|
| 93 |
+
|
| 94 |
+
assert (
|
| 95 |
+
self.lang_encoder._use_cached_vision_x or vision_x is not None
|
| 96 |
+
), "Must provide either vision_x or have precached media using cache_media()."
|
| 97 |
+
|
| 98 |
+
if self.lang_encoder._use_cached_vision_x:
|
| 99 |
+
# Case: use cached; vision_x should be cached and other
|
| 100 |
+
# vision-related inputs should not be provided.
|
| 101 |
+
assert (
|
| 102 |
+
vision_x is None
|
| 103 |
+
), "Expect vision_x to be None when media has been cached using cache_media(). Try uncache_media() first."
|
| 104 |
+
assert self.lang_encoder.is_conditioned()
|
| 105 |
+
|
| 106 |
+
else:
|
| 107 |
+
# Case: do not use caching (i.e. this is a standard forward pass);
|
| 108 |
+
self._encode_vision_x(vision_x=vision_x)
|
| 109 |
+
self._condition_media_locations(input_ids=lang_x)
|
| 110 |
+
|
| 111 |
+
output = self.lang_encoder(
|
| 112 |
+
input_ids=lang_x,
|
| 113 |
+
attention_mask=attention_mask,
|
| 114 |
+
labels=labels,
|
| 115 |
+
past_key_values=past_key_values,
|
| 116 |
+
use_cache=use_cache,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
if clear_conditioned_layers:
|
| 120 |
+
self.lang_encoder.clear_conditioned_layers()
|
| 121 |
+
|
| 122 |
+
return output
|
| 123 |
+
|
| 124 |
+
def generate(
|
| 125 |
+
self,
|
| 126 |
+
vision_x: torch.Tensor,
|
| 127 |
+
lang_x: torch.Tensor,
|
| 128 |
+
attention_mask: torch.Tensor = None,
|
| 129 |
+
**kwargs,
|
| 130 |
+
):
|
| 131 |
+
"""
|
| 132 |
+
Generate text conditioned on vision and language inputs.
|
| 133 |
+
|
| 134 |
+
Args:
|
| 135 |
+
vision_x (torch.Tensor): Vision input
|
| 136 |
+
shape (B, T_img, F, C, H, W)
|
| 137 |
+
images in the same chunk are collated along T_img, and frames are collated along F
|
| 138 |
+
currently only F=1 is supported (single-frame videos)
|
| 139 |
+
lang_x (torch.Tensor): Language input
|
| 140 |
+
shape (B, T_txt)
|
| 141 |
+
**kwargs: see generate documentation in Hugging Face CausalLM models. Some notable kwargs:
|
| 142 |
+
max_length (int, optional): Maximum length of the output. Defaults to None.
|
| 143 |
+
attention_mask (torch.Tensor, optional): Attention mask. Defaults to None.
|
| 144 |
+
num_beams (int, optional): Number of beams. Defaults to 1.
|
| 145 |
+
max_new_tokens (int, optional): Maximum new tokens. Defaults to None.
|
| 146 |
+
temperature (float, optional): Temperature. Defaults to 1.0.
|
| 147 |
+
top_k (int, optional): Top k. Defaults to 50.
|
| 148 |
+
top_p (float, optional): Top p. Defaults to 1.0.
|
| 149 |
+
no_repeat_ngram_size (int, optional): No repeat ngram size. Defaults to 0.
|
| 150 |
+
length_penalty (float, optional): Length penalty. Defaults to 1.0.
|
| 151 |
+
num_return_sequences (int, optional): Number of return sequences. Defaults to 1.
|
| 152 |
+
do_sample (bool, optional): Do sample. Defaults to False.
|
| 153 |
+
early_stopping (bool, optional): Early stopping. Defaults to False.
|
| 154 |
+
Returns:
|
| 155 |
+
torch.Tensor: lang_x with generated tokens appended to it
|
| 156 |
+
"""
|
| 157 |
+
num_beams = kwargs.pop("num_beams", 1)
|
| 158 |
+
if num_beams > 1:
|
| 159 |
+
vision_x = vision_x.repeat_interleave(num_beams, dim=0)
|
| 160 |
+
|
| 161 |
+
self.lang_encoder._use_cached_vision_x = True
|
| 162 |
+
self._encode_vision_x(vision_x=vision_x)
|
| 163 |
+
|
| 164 |
+
eos_token_id = kwargs.pop("eos_token_id", self.eoc_token_id)
|
| 165 |
+
output = self.lang_encoder.generate(
|
| 166 |
+
input_ids=lang_x,
|
| 167 |
+
attention_mask=attention_mask,
|
| 168 |
+
eos_token_id=eos_token_id,
|
| 169 |
+
num_beams=num_beams,
|
| 170 |
+
**kwargs,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
self.lang_encoder.clear_conditioned_layers()
|
| 174 |
+
self.lang_encoder._use_cached_vision_x = False
|
| 175 |
+
return output
|
| 176 |
+
|
| 177 |
+
def _encode_vision_x(self, vision_x: torch.Tensor):
|
| 178 |
+
"""
|
| 179 |
+
Compute media tokens from vision input by passing it through vision encoder and conditioning language model.
|
| 180 |
+
Args:
|
| 181 |
+
vision_x (torch.Tensor): Vision input
|
| 182 |
+
shape (B, T_img, F, C, H, W)
|
| 183 |
+
Images in the same chunk are collated along T_img, and frames are collated along F
|
| 184 |
+
Currently only F=1 is supported (single-frame videos)
|
| 185 |
+
|
| 186 |
+
rearrange code based on https://github.com/dhansmair/flamingo-mini
|
| 187 |
+
"""
|
| 188 |
+
|
| 189 |
+
assert vision_x.ndim == 6, "vision_x should be of shape (b, T_img, F, C, H, W)"
|
| 190 |
+
b, T, F = vision_x.shape[:3]
|
| 191 |
+
assert F == 1, "Only single frame supported"
|
| 192 |
+
|
| 193 |
+
vision_x = rearrange(vision_x, "b T F c h w -> (b T F) c h w")
|
| 194 |
+
with torch.no_grad():
|
| 195 |
+
vision_x = self.vision_encoder(vision_x)[1]
|
| 196 |
+
vision_x = rearrange(vision_x, "(b T F) v d -> b T F v d", b=b, T=T, F=F)
|
| 197 |
+
vision_x = self.perceiver(vision_x)
|
| 198 |
+
|
| 199 |
+
for layer in self.lang_encoder._get_decoder_layers():
|
| 200 |
+
layer.condition_vis_x(vision_x)
|
| 201 |
+
|
| 202 |
+
def wrap_fsdp(self, wrapper_kwargs, device_id):
|
| 203 |
+
"""
|
| 204 |
+
Manually wraps submodules for FSDP and move other parameters to device_id.
|
| 205 |
+
|
| 206 |
+
Why manually wrap?
|
| 207 |
+
- all parameters within the FSDP wrapper must have the same requires_grad.
|
| 208 |
+
We have a mix of frozen and unfrozen parameters.
|
| 209 |
+
- model.vision_encoder.visual needs to be individually wrapped or encode_vision_x errors
|
| 210 |
+
See: https://github.com/pytorch/pytorch/issues/82461#issuecomment-1269136344
|
| 211 |
+
|
| 212 |
+
The rough wrapping structure is:
|
| 213 |
+
- FlamingoModel
|
| 214 |
+
- FSDP(FSDP(vision_encoder))
|
| 215 |
+
- FSDP(FSDP(perceiver))
|
| 216 |
+
- lang_encoder
|
| 217 |
+
- FSDP(FSDP(input_embeddings))
|
| 218 |
+
- FlamingoLayers
|
| 219 |
+
- FSDP(FSDP(gated_cross_attn_layer))
|
| 220 |
+
- FSDP(FSDP(decoder_layer))
|
| 221 |
+
- FSDP(FSDP(output_embeddings))
|
| 222 |
+
- other parameters
|
| 223 |
+
|
| 224 |
+
Known issues:
|
| 225 |
+
- Our FSDP strategy is not compatible with tied embeddings. If the LM embeddings are tied,
|
| 226 |
+
train with DDP or set the --freeze_lm_embeddings flag to true.
|
| 227 |
+
- With FSDP + gradient ckpting, one can increase the batch size with seemingly no upper bound.
|
| 228 |
+
Although the training curves look okay, we found that downstream performance dramatically
|
| 229 |
+
degrades if the batch size is unreasonably large (e.g., 100 MMC4 batch size for OPT-125M).
|
| 230 |
+
|
| 231 |
+
FAQs about our FSDP wrapping strategy:
|
| 232 |
+
Why double wrap?
|
| 233 |
+
As of torch==2.0.1, FSDP's _post_forward_hook and _post_backward_hook
|
| 234 |
+
only free gathered parameters if the module is NOT FSDP root.
|
| 235 |
+
|
| 236 |
+
Why unfreeze the decoder_layers?
|
| 237 |
+
See https://github.com/pytorch/pytorch/issues/95805
|
| 238 |
+
As of torch==2.0.1, FSDP's _post_backward_hook is only registed if the flat param
|
| 239 |
+
requires_grad=True. We need the postback to fire to avoid OOM.
|
| 240 |
+
To effectively freeze the decoder layers, we exclude them from the optimizer.
|
| 241 |
+
|
| 242 |
+
What is assumed to be frozen v. unfrozen?
|
| 243 |
+
We assume that the model is being trained under normal Flamingo settings
|
| 244 |
+
with these lines being called in factory.py:
|
| 245 |
+
```
|
| 246 |
+
# Freeze all parameters
|
| 247 |
+
model.requires_grad_(False)
|
| 248 |
+
assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0
|
| 249 |
+
|
| 250 |
+
# Unfreeze perceiver, gated_cross_attn_layers, and LM input embeddings
|
| 251 |
+
model.perceiver.requires_grad_(True)
|
| 252 |
+
model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
|
| 253 |
+
[optional] model.lang_encoder.get_input_embeddings().requires_grad_(True)
|
| 254 |
+
```
|
| 255 |
+
"""
|
| 256 |
+
# unfreeze the decoder layers
|
| 257 |
+
for block in self.lang_encoder.old_decoder_blocks:
|
| 258 |
+
block.requires_grad_(True)
|
| 259 |
+
|
| 260 |
+
# wrap in FSDP
|
| 261 |
+
with enable_wrap(wrapper_cls=FSDP, **wrapper_kwargs):
|
| 262 |
+
self.perceiver = wrap(wrap(self.perceiver))
|
| 263 |
+
self.lang_encoder.old_decoder_blocks = nn.ModuleList(
|
| 264 |
+
wrap(wrap(block)) for block in self.lang_encoder.old_decoder_blocks
|
| 265 |
+
)
|
| 266 |
+
self.lang_encoder.gated_cross_attn_layers = nn.ModuleList(
|
| 267 |
+
wrap(wrap(layer)) if layer is not None else None
|
| 268 |
+
for layer in self.lang_encoder.gated_cross_attn_layers
|
| 269 |
+
)
|
| 270 |
+
self.lang_encoder.init_flamingo_layers(self._use_gradient_checkpointing)
|
| 271 |
+
self.lang_encoder.set_input_embeddings(
|
| 272 |
+
wrap(wrap(self.lang_encoder.get_input_embeddings()))
|
| 273 |
+
)
|
| 274 |
+
self.lang_encoder.set_output_embeddings(
|
| 275 |
+
wrap(wrap(self.lang_encoder.get_output_embeddings()))
|
| 276 |
+
)
|
| 277 |
+
self.vision_encoder = wrap(wrap(self.vision_encoder)) # frozen
|
| 278 |
+
|
| 279 |
+
# manually move non-FSDP managed parameters to device_id
|
| 280 |
+
# these are all in lang_encoder
|
| 281 |
+
apply_with_stopping_condition(
|
| 282 |
+
module=self.lang_encoder,
|
| 283 |
+
apply_fn=lambda m: m.to(device_id),
|
| 284 |
+
apply_condition=lambda m: len(list(m.children())) == 0,
|
| 285 |
+
stopping_condition=lambda m: isinstance(m, FSDP),
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
# exclude the original decoder layers from the optimizer
|
| 289 |
+
for block in self.lang_encoder.old_decoder_blocks:
|
| 290 |
+
for p in block.parameters():
|
| 291 |
+
p.exclude_from_optimizer = True
|
| 292 |
+
|
| 293 |
+
# set up clip_grad_norm_ function
|
| 294 |
+
def clip_grad_norm_(max_norm):
|
| 295 |
+
self.perceiver.clip_grad_norm_(max_norm)
|
| 296 |
+
for layer in self.lang_encoder.gated_cross_attn_layers:
|
| 297 |
+
if layer is not None:
|
| 298 |
+
layer.clip_grad_norm_(max_norm)
|
| 299 |
+
self.lang_encoder.get_input_embeddings().clip_grad_norm_(max_norm)
|
| 300 |
+
|
| 301 |
+
self.clip_grad_norm_ = clip_grad_norm_
|
| 302 |
+
|
| 303 |
+
def _condition_media_locations(self, input_ids: torch.Tensor):
|
| 304 |
+
"""
|
| 305 |
+
Compute the media token locations from lang_x and condition the language model on these.
|
| 306 |
+
Args:
|
| 307 |
+
input_ids (torch.Tensor): Language input
|
| 308 |
+
shape (B, T_txt)
|
| 309 |
+
"""
|
| 310 |
+
media_locations = input_ids == self.media_token_id
|
| 311 |
+
|
| 312 |
+
for layer in self.lang_encoder._get_decoder_layers():
|
| 313 |
+
layer.condition_media_locations(media_locations)
|
| 314 |
+
|
| 315 |
+
def cache_media(self, input_ids: torch.Tensor, vision_x: torch.Tensor):
|
| 316 |
+
"""
|
| 317 |
+
Pre-cache a prompt/sequence of images / text for log-likelihood evaluations.
|
| 318 |
+
All subsequent calls to forward() will generate attending to the LAST
|
| 319 |
+
image in vision_x.
|
| 320 |
+
This is not meant to be used to cache things for generate().
|
| 321 |
+
Args:
|
| 322 |
+
input_ids (torch.Tensor): Language input
|
| 323 |
+
shape (B, T_txt)
|
| 324 |
+
vision_x (torch.Tensor): Vision input
|
| 325 |
+
shape (B, T_img, F, C, H, W)
|
| 326 |
+
Images in the same chunk are collated along T_img, and frames are collated along F
|
| 327 |
+
Currently only F=1 is supported (single-frame videos)
|
| 328 |
+
"""
|
| 329 |
+
self._encode_vision_x(vision_x=vision_x)
|
| 330 |
+
self._condition_media_locations(input_ids=input_ids)
|
| 331 |
+
self.lang_encoder._use_cached_vision_x = True
|
| 332 |
+
|
| 333 |
+
def uncache_media(self):
|
| 334 |
+
"""
|
| 335 |
+
Clear all conditioning.
|
| 336 |
+
"""
|
| 337 |
+
self.lang_encoder.clear_conditioned_layers()
|
| 338 |
+
self.lang_encoder._use_cached_vision_x = False
|
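A small sketch of the input shapes that the Flamingo module above expects, following its forward() and _encode_vision_x() docstrings: vision_x is (B, T_img, F, C, H, W) with F=1. The model, image_processor, and tokenizer are assumed to come from create_model_and_transforms(), and the "<image>" prompt convention is the usual OpenFlamingo one, not something this diff confirms.

```python
# Sketch: package two camera views into the (B, T_img, F, C, H, W) layout and run a forward pass.
import torch
from PIL import Image

frames = [Image.new("RGB", (224, 224)) for _ in range(2)]        # e.g. third-person + gripper view
vision_x = torch.stack([image_processor(im) for im in frames])    # (T_img, C, H, W)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)                     # (B=1, T_img=2, F=1, C, H, W)

lang_x = tokenizer("<image><image>Pick up the red block", return_tensors="pt")
with torch.no_grad():
    out = model(vision_x=vision_x,
                lang_x=lang_x["input_ids"],
                attention_mask=lang_x["attention_mask"])
```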
open_flamingo/src/flamingo_lm.py
ADDED
@@ -0,0 +1,191 @@
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from .helpers import GatedCrossAttentionBlock
|
| 3 |
+
from .utils import getattr_recursive, setattr_recursive
|
| 4 |
+
import copy
|
| 5 |
+
|
| 6 |
+
class FlamingoLayer(nn.Module):
|
| 7 |
+
"""
|
| 8 |
+
FlamingoLayer is a wrapper around the GatedCrossAttentionBlock and DecoderLayer.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
def __init__(
|
| 12 |
+
self, gated_cross_attn_layer, decoder_layer, gradient_checkpointing=False, residual=False
|
| 13 |
+
):
|
| 14 |
+
super().__init__()
|
| 15 |
+
self.gated_cross_attn_layer = gated_cross_attn_layer
|
| 16 |
+
self.decoder_layer = decoder_layer
|
| 17 |
+
self.vis_x = None
|
| 18 |
+
self.media_locations = None
|
| 19 |
+
self.residual = residual
|
| 20 |
+
|
| 21 |
+
if self.gated_cross_attn_layer is not None:
|
| 22 |
+
self.gated_cross_attn_layer._use_gradient_checkpointing = (
|
| 23 |
+
gradient_checkpointing
|
| 24 |
+
)
|
| 25 |
+
self.decoder_layer._use_gradient_checkpointing = gradient_checkpointing
|
| 26 |
+
|
| 27 |
+
def clone_parameters(self):
|
| 28 |
+
self.res_layer = copy.deepcopy(self.gated_cross_attn_layer)
|
| 29 |
+
if self.res_layer is not None:
|
| 30 |
+
self.res_layer.requires_grad_(False)
|
| 31 |
+
|
| 32 |
+
def is_conditioned(self) -> bool:
|
| 33 |
+
"""Check whether the layer is conditioned."""
|
| 34 |
+
return self.vis_x is not None and self.media_locations is not None
|
| 35 |
+
|
| 36 |
+
# Used this great idea from this implementation of Flamingo (https://github.com/dhansmair/flamingo-mini/)
|
| 37 |
+
def condition_vis_x(self, vis_x):
|
| 38 |
+
self.vis_x = vis_x
|
| 39 |
+
|
| 40 |
+
def condition_media_locations(self, media_locations):
|
| 41 |
+
self.media_locations = media_locations
|
| 42 |
+
|
| 43 |
+
def condition_use_cached_media(self, use_cached_media):
|
| 44 |
+
self.use_cached_media = use_cached_media
|
| 45 |
+
|
| 46 |
+
def forward(
|
| 47 |
+
self,
|
| 48 |
+
lang_x,
|
| 49 |
+
attention_mask=None,
|
| 50 |
+
**decoder_layer_kwargs,
|
| 51 |
+
):
|
| 52 |
+
# Cross attention
|
| 53 |
+
if self.gated_cross_attn_layer is not None:
|
| 54 |
+
if self.vis_x is None:
|
| 55 |
+
raise ValueError("vis_x must be conditioned before forward pass")
|
| 56 |
+
|
| 57 |
+
if self.media_locations is None:
|
| 58 |
+
raise ValueError(
|
| 59 |
+
"media_locations must be conditioned before forward pass"
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
lang_x = self.gated_cross_attn_layer(
|
| 63 |
+
lang_x,
|
| 64 |
+
self.vis_x,
|
| 65 |
+
media_locations=self.media_locations,
|
| 66 |
+
use_cached_media=self.use_cached_media,
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Residual
|
| 70 |
+
if self.residual and self.res_layer is not None:
|
| 71 |
+
lang_x_res = self.res_layer(
|
| 72 |
+
lang_x,
|
| 73 |
+
self.vis_x,
|
| 74 |
+
media_locations=self.media_locations,
|
| 75 |
+
attend_previous=self.attend_previous,
|
| 76 |
+
)
|
| 77 |
+
lang_x = (lang_x + lang_x_res) / 2.0
|
| 78 |
+
|
| 79 |
+
# Normal decoder layer
|
| 80 |
+
lang_x = self.decoder_layer(
|
| 81 |
+
lang_x, attention_mask=attention_mask, **decoder_layer_kwargs
|
| 82 |
+
)
|
| 83 |
+
return lang_x
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class FlamingoLMMixin(nn.Module):
|
| 87 |
+
"""
|
| 88 |
+
Mixin to add cross-attention layers to a language model.
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
def set_decoder_layers_attr_name(self, decoder_layers_attr_name):
|
| 92 |
+
self.decoder_layers_attr_name = decoder_layers_attr_name
|
| 93 |
+
|
| 94 |
+
def _get_decoder_layers(self):
|
| 95 |
+
return getattr_recursive(self, self.decoder_layers_attr_name)
|
| 96 |
+
|
| 97 |
+
def _set_decoder_layers(self, value):
|
| 98 |
+
setattr_recursive(self, self.decoder_layers_attr_name, value)
|
| 99 |
+
|
| 100 |
+
def init_flamingo(
|
| 101 |
+
self,
|
| 102 |
+
media_token_id,
|
| 103 |
+
lang_hidden_size,
|
| 104 |
+
vis_hidden_size,
|
| 105 |
+
cross_attn_every_n_layers,
|
| 106 |
+
gradient_checkpointing,
|
| 107 |
+
residual=False,
|
| 108 |
+
):
|
| 109 |
+
"""
|
| 110 |
+
Initialize Flamingo by adding a new gated cross attn to the decoder. Store the media token id for computing the media locations.
|
| 111 |
+
"""
|
| 112 |
+
print('-'*100)
|
| 113 |
+
print(self.decoder_layers_attr_name)
|
| 114 |
+
self.old_decoder_blocks = self._get_decoder_layers()
|
| 115 |
+
self.gated_cross_attn_layers = nn.ModuleList(
|
| 116 |
+
[
|
| 117 |
+
GatedCrossAttentionBlock(
|
| 118 |
+
dim=lang_hidden_size, dim_visual=vis_hidden_size
|
| 119 |
+
)
|
| 120 |
+
if (layer_idx + 1) % cross_attn_every_n_layers == 0
|
| 121 |
+
else None
|
| 122 |
+
for layer_idx, _ in enumerate(self._get_decoder_layers())
|
| 123 |
+
]
|
| 124 |
+
)
|
| 125 |
+
self.init_flamingo_layers(gradient_checkpointing, residual=residual)
|
| 126 |
+
self.media_token_id = media_token_id
|
| 127 |
+
self.initialized_flamingo = True
|
| 128 |
+
self._use_cached_vision_x = False
|
| 129 |
+
|
| 130 |
+
def init_flamingo_layers(self, gradient_checkpointing, residual=False):
|
| 131 |
+
"""
|
| 132 |
+
Re initializes the FlamingoLayers.
|
| 133 |
+
Propagates any changes made to self.gated_corss_attn_layers or self.old_decoder_blocks
|
| 134 |
+
"""
|
| 135 |
+
self._set_decoder_layers(
|
| 136 |
+
nn.ModuleList(
|
| 137 |
+
[
|
| 138 |
+
FlamingoLayer(
|
| 139 |
+
gated_cross_attn_layer, decoder_layer, gradient_checkpointing, residual=residual
|
| 140 |
+
)
|
| 141 |
+
for gated_cross_attn_layer, decoder_layer in zip(
|
| 142 |
+
self.gated_cross_attn_layers, self.old_decoder_blocks
|
| 143 |
+
)
|
| 144 |
+
]
|
| 145 |
+
)
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
def forward(self, input_ids, attention_mask, **kwargs):
|
| 149 |
+
"""Condition the Flamingo layers on the media locations before forward()"""
|
| 150 |
+
if not self.initialized_flamingo:
|
| 151 |
+
raise ValueError(
|
| 152 |
+
"Flamingo layers are not initialized. Please call `init_flamingo` first."
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
media_locations = input_ids == self.media_token_id
|
| 156 |
+
|
| 157 |
+
# if there are media already cached and we're generating and there are no media tokens in the input,
|
| 158 |
+
# we'll assume that ALL input tokens should attend to the last previous media that is cached.
|
| 159 |
+
# this is especially important for HF generate() compatibility, since generate() calls forward()
|
| 160 |
+
# repeatedly one token at a time (with no media tokens).
|
| 161 |
+
# without this check, the model would not attend to any images when generating (after the first token)
|
| 162 |
+
use_cached_media_locations = (
|
| 163 |
+
self._use_cached_vision_x
|
| 164 |
+
and self.is_conditioned()
|
| 165 |
+
and not media_locations.any()
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
for layer in self._get_decoder_layers():
|
| 169 |
+
if not use_cached_media_locations:
|
| 170 |
+
layer.condition_media_locations(media_locations)
|
| 171 |
+
layer.condition_use_cached_media(use_cached_media_locations)
|
| 172 |
+
|
| 173 |
+
# package arguments for the other parent's forward. since we don't know the order of the arguments,
|
| 174 |
+
# make them all kwargs
|
| 175 |
+
kwargs["input_ids"] = input_ids
|
| 176 |
+
kwargs["attention_mask"] = attention_mask
|
| 177 |
+
return super().forward(**kwargs) # Call the other parent's forward method
|
| 178 |
+
|
| 179 |
+
def is_conditioned(self) -> bool:
|
| 180 |
+
"""Check whether all decoder layers are already conditioned."""
|
| 181 |
+
return all(l.is_conditioned() for l in self._get_decoder_layers())
|
| 182 |
+
|
| 183 |
+
def clone_parameters(self):
|
| 184 |
+
for layer in self._get_decoder_layers():
|
| 185 |
+
layer.clone_parameters()
|
| 186 |
+
|
| 187 |
+
def clear_conditioned_layers(self):
|
| 188 |
+
for layer in self._get_decoder_layers():
|
| 189 |
+
layer.condition_vis_x(None)
|
| 190 |
+
layer.condition_media_locations(None)
|
| 191 |
+
layer.condition_use_cached_media(None)
|
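FlamingoLMMixin is not instantiated directly; factory.py grafts it onto an existing Hugging Face causal LM instance with extend_instance() from open_flamingo/src/utils.py (part of this commit but not shown here). The toy example below illustrates that class-rewriting pattern under the assumption that extend_instance swaps the object's class for one that inherits from the mixin first; it is an illustration, not the repo's code.

```python
# Toy illustration of the mixin-grafting pattern used by factory.py.
class Base:
    def forward(self, x):
        return x + 1

class Mixin:
    def forward(self, x):
        # conditioning/bookkeeping would happen here, then defer to the base class
        return super().forward(x) * 10

def extend_instance(obj, mixin):
    base_cls = obj.__class__
    # replace the instance's class with a new one whose MRO puts the mixin first
    obj.__class__ = type(base_cls.__name__, (mixin, base_cls), {})

m = Base()
extend_instance(m, Mixin)
print(m.forward(2))  # 30: Mixin.forward wraps Base.forward
```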
open_flamingo/src/helpers.py
ADDED
@@ -0,0 +1,279 @@
| 1 |
+
"""
|
| 2 |
+
Based on: https://github.com/lucidrains/flamingo-pytorch
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from einops import rearrange, repeat
|
| 7 |
+
from einops_exts import rearrange_many
|
| 8 |
+
from torch import einsum, nn
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def exists(val):
|
| 12 |
+
return val is not None
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def FeedForward(dim, mult=4):
|
| 16 |
+
inner_dim = int(dim * mult)
|
| 17 |
+
return nn.Sequential(
|
| 18 |
+
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm_media = nn.LayerNorm(dim)
+        self.norm_latents = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, T, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, T, n2, D)
+        """
+        x = self.norm_media(x)
+        latents = self.norm_latents(latents)
+
+        h = self.heads
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+        q, k, v = rearrange_many((q, k, v), "b t n (h d) -> b h t n d", h=h)
+        q = q * self.scale
+
+        # attention
+        sim = einsum("... i d, ... j d -> ... i j", q, k)
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        out = einsum("... i j, ... j d -> ... i d", attn, v)
+        out = rearrange(out, "b h t n d -> b t n (h d)", h=h)
+        return self.to_out(out)
+
+
+class PerceiverResampler(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        depth=6,
+        dim_head=64,
+        heads=8,
+        num_latents=64,
+        max_num_media=None,
+        max_num_frames=None,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(num_latents, dim))
+        self.frame_embs = (
+            nn.Parameter(torch.randn(max_num_frames, dim))
+            if exists(max_num_frames)
+            else None
+        )
+        self.media_time_embs = (
+            nn.Parameter(torch.randn(max_num_media, 1, dim))
+            if exists(max_num_media)
+            else None
+        )
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, x):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, T, F, v, D)
+        Returns:
+            shape (b, T, n, D) where n is self.num_latents
+        """
+        b, T, F, v = x.shape[:4]
+
+        # frame and media time embeddings
+        if exists(self.frame_embs):
+            frame_embs = repeat(self.frame_embs[:F], "F d -> b T F v d", b=b, T=T, v=v)
+            x = x + frame_embs
+        x = rearrange(
+            x, "b T F v d -> b T (F v) d"
+        )  # flatten the frame and spatial dimensions
+        if exists(self.media_time_embs):
+            x = x + self.media_time_embs[:T]
+
+        # blocks
+        latents = repeat(self.latents, "n d -> b T n d", b=b, T=T)
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+        return self.norm(latents)
+
+
+# gated cross attention
+class MaskedCrossAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_visual,
+        dim_head=64,
+        heads=8,
+        only_attend_immediate_media=True,
+    ):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim_visual, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+        # whether for text to only attend to immediate preceding image, or all previous images
+        self.only_attend_immediate_media = only_attend_immediate_media
+
+    def forward(self, x, media, media_locations=None, use_cached_media=False):
+        """
+        Args:
+            x (torch.Tensor): text features
+                shape (B, T_txt, D_txt)
+            media (torch.Tensor): image features
+                shape (B, T_img, n, D_img) where n is the dim of the latents
+            media_locations: boolean mask identifying the media tokens in x
+                shape (B, T_txt)
+            use_cached_media: bool
+                If true, treat all of x as if they occur after the last media
+                registered in media_locations. T_txt does not need to exactly
+                equal media_locations.shape[1] in this case
+        """
+
+        if not use_cached_media:
+            assert (
+                media_locations.shape[1] == x.shape[1]
+            ), f"media_location.shape is {media_locations.shape} but x.shape is {x.shape}"
+
+        T_txt = x.shape[1]
+        _, T_img, n = media.shape[:3]
+        h = self.heads
+
+        x = self.norm(x)
+
+        q = self.to_q(x)
+        media = rearrange(media, "b t n d -> b (t n) d")
+
+        k, v = self.to_kv(media).chunk(2, dim=-1)
+        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=h)
+
+        q = q * self.scale
+
+        sim = einsum("... i d, ... j d -> ... i j", q, k)
+
+        if exists(media_locations):
+            media_time = torch.arange(T_img, device=x.device) + 1
+
+            if use_cached_media:
+                # text time is set to the last cached media location
+                text_time = repeat(
+                    torch.count_nonzero(media_locations, dim=1),
+                    "b -> b i",
+                    i=T_txt,
+                )
+            else:
+                # at each boolean of True, increment the time counter (relative to media time)
+                text_time = media_locations.cumsum(dim=-1)
+
+            # text time must equal media time if only attending to most immediate image
+            # otherwise, as long as text time is greater than media time (if attending to all previous images / media)
+            mask_op = torch.eq if self.only_attend_immediate_media else torch.ge
+
+            text_to_media_mask = mask_op(
+                rearrange(text_time, "b i -> b 1 i 1"),
+                repeat(media_time, "j -> 1 1 1 (j n)", n=n),
+            )
+            sim = sim.masked_fill(~text_to_media_mask, -torch.finfo(sim.dtype).max)
+
+        sim = sim - sim.amax(dim=-1, keepdim=True).detach()
+        attn = sim.softmax(dim=-1)
+
+        if exists(media_locations) and self.only_attend_immediate_media:
+            # any text without a preceding media needs to have attention zeroed out
+            text_without_media_mask = text_time == 0
+            text_without_media_mask = rearrange(
+                text_without_media_mask, "b i -> b 1 i 1"
+            )
+            attn = attn.masked_fill(text_without_media_mask, 0.0)
+
+        out = einsum("... i j, ... j d -> ... i d", attn, v)
+        out = rearrange(out, "b h n d -> b n (h d)")
+        return self.to_out(out)
+
+
+class GatedCrossAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_visual,
+        dim_head=64,
+        heads=8,
+        ff_mult=4,
+        only_attend_immediate_media=True,
+    ):
+        super().__init__()
+        self.attn = MaskedCrossAttention(
+            dim=dim,
+            dim_visual=dim_visual,
+            dim_head=dim_head,
+            heads=heads,
+            only_attend_immediate_media=only_attend_immediate_media,
+        )
+        self.attn_gate = nn.Parameter(torch.tensor([0.0]))
+
+        self.ff = FeedForward(dim, mult=ff_mult)
+        self.ff_gate = nn.Parameter(torch.tensor([0.0]))
+
+    def forward(
+        self,
+        x,
+        media,
+        media_locations=None,
+        use_cached_media=False,
+    ):
+        x = (
+            self.attn(
+                x,
+                media,
+                media_locations=media_locations,
+                use_cached_media=use_cached_media,
+            )
+            * self.attn_gate.tanh()
+            + x
+        )
+        x = self.ff(x) * self.ff_gate.tanh() + x
+
+        return x
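Note (illustrative, not part of this commit): the PerceiverResampler above compresses a variable number of vision tokens into a fixed set of latents, and GatedCrossAttentionBlock injects those latents into the text stream through tanh-gated cross-attention whose gates start at zero. A minimal shape-check sketch, assuming `open_flamingo/src` is on `sys.path` and torch, einops, and einops-exts are installed:

# Illustrative sketch only -- not part of the committed files.
import torch
from helpers import PerceiverResampler, GatedCrossAttentionBlock  # assumes open_flamingo/src on sys.path

vision_feats = torch.randn(2, 1, 1, 256, 1024)           # (batch, T_img, frames, patches, dim)
resampler = PerceiverResampler(dim=1024, depth=6, num_latents=64)
latents = resampler(vision_feats)                         # -> (2, 1, 64, 1024)

text_hidden = torch.randn(2, 32, 1024)                    # (batch, T_txt, dim)
media_locations = torch.zeros(2, 32, dtype=torch.bool)    # True where an image token sits
media_locations[:, 0] = True

block = GatedCrossAttentionBlock(dim=1024, dim_visual=1024)
out = block(text_hidden, latents, media_locations=media_locations)
print(out.shape)                                          # torch.Size([2, 32, 1024])

Because both gates are initialized to 0.0, the block's output equals its text input before training; the vision pathway is blended in only as the gates are learned.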
open_flamingo/src/utils.py
ADDED
@@ -0,0 +1,48 @@
+def extend_instance(obj, mixin):
+    """Apply mixins to a class instance after creation"""
+    base_cls = obj.__class__
+    base_cls_name = obj.__class__.__name__
+    obj.__class__ = type(
+        base_cls_name, (mixin, base_cls), {}
+    )  # mixin needs to go first for our forward() logic to work
+
+
+def getattr_recursive(obj, att):
+    """
+    Return nested attribute of obj
+    Example: getattr_recursive(obj, 'a.b.c') is equivalent to obj.a.b.c
+    """
+    if att == "":
+        return obj
+    i = att.find(".")
+    if i < 0:
+        return getattr(obj, att)
+    else:
+        return getattr_recursive(getattr(obj, att[:i]), att[i + 1 :])
+
+
+def setattr_recursive(obj, att, val):
+    """
+    Set nested attribute of obj
+    Example: setattr_recursive(obj, 'a.b.c', val) is equivalent to obj.a.b.c = val
+    """
+    if "." in att:
+        obj = getattr_recursive(obj, ".".join(att.split(".")[:-1]))
+    setattr(obj, att.split(".")[-1], val)
+
+
+def apply_with_stopping_condition(
+    module, apply_fn, apply_condition=None, stopping_condition=None, **other_args
+):
+    if stopping_condition(module):
+        return
+    if apply_condition(module):
+        apply_fn(module, **other_args)
+    for child in module.children():
+        apply_with_stopping_condition(
+            child,
+            apply_fn,
+            apply_condition=apply_condition,
+            stopping_condition=stopping_condition,
+            **other_args
+        )
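Note (illustrative, not part of this commit): these helpers are used elsewhere in the package to mix Flamingo behavior into an existing language-model instance and to reach nested submodules by dotted path. A toy sketch of how they behave, assuming the functions above are importable (e.g. `from utils import ...` with `open_flamingo/src` on `sys.path`):

# Illustrative sketch only -- not part of the committed files.
from utils import extend_instance, getattr_recursive, setattr_recursive


class Greeter:
    def speak(self):
        return "hello"


class LoudMixin:
    def speak(self):
        return super().speak().upper()


g = Greeter()
extend_instance(g, LoudMixin)   # g's class becomes type("Greeter", (LoudMixin, Greeter), {})
print(g.speak())                # HELLO -- the mixin's method comes first in the MRO


class Inner:
    pass


class Outer:
    pass


o = Outer()
o.inner = Inner()
setattr_recursive(o, "inner.value", 42)
print(getattr_recursive(o, "inner.value"))  # 42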