Loading into root will supposedly make them easier to find

Browse files

Files changed (15) hide show

caption_match.py +247 -0
common_settings.py +18 -0
general_training_helper.py +172 -0
latent_diffusion_pipeline.py +99 -0
model_index.json +1 -1
naming_conventions.py +29 -0
pipeline_loader.py +41 -0
plotter.py +173 -0
sampler.py +473 -0
sentence_transformers_helper.py +114 -0
text_diffusion_pipeline.py +442 -0
text_model.py +206 -0
text_to_level_diffusion.py +194 -0
tokenizer.py +147 -0
util.py +216 -0

caption_match.py ADDED Viewed

	@@ -0,0 +1,247 @@

+from create_ascii_captions import assign_caption
# Quantity words ordered from fewest to most; the distance between two terms'
# positions in this list drives the partial-match score in quantity_score.
QUANTITY_TERMS = ["one", "two", "a few", "several", "many"]

# Topics compared between captions. Multi-word topics are listed before the
# shorter keywords they contain (e.g. "broken pipe" before "pipe") because
# extract_phrases hands each phrase to the first topic that matches it.
# "giant gap" is intentionally absent (gaps are believed to be subsumed by the
# floor topic) and "wall" is currently disabled.
TOPIC_KEYWORDS = [
    "floor",
    "ceiling",
    "broken pipe",
    "upside down pipe",
    "pipe",
    "coin line",
    "coin",
    "platform",
    "tower",
    "broken cannon",
    "cannon",
    "ascending staircase",
    "descending staircase",
    "rectangular",
    "irregular",
    "question block",
    "loose block",
    "enem",  # prefix that catches both "enemy" and "enemies"
]

# (keyword, negated plural) pairs. Kept as a list because the order matters.
KEYWORD_TO_NEGATED_PLURAL = [
    (" broken pipe.", ""),    # when it is not the first phrase
    ("broken pipe. ", ""),    # when it is the first phrase (after removing all others)
    (" broken cannon.", ""),  # when it is not the first phrase
    ("broken cannon. ", ""),  # when it is the first phrase (after removing all others)
    ("pipe", "pipes"),
    ("cannon", "cannons"),
    ("platform", "platforms"),
    ("tower", "towers"),
    ("staircase", "staircases"),
    ("enem", "enemies"),
    ("rectangular", "rectangular block clusters"),
    ("irregular", "irregular block clusters"),
    ("coin line", "coin lines"),
    ("coin.", "coins."),  # the period avoids matching "coin line"
    ("question block", "question blocks"),
    ("loose block", "loose blocks"),
]

# Number of topics that are considered "broken" (e.g. "broken pipe", "broken cannon").
BROKEN_TOPICS = 2

# Irregular plurals that the generic trailing-"s" rule cannot handle.
PLURAL_EXCEPTIONS = {
    "enemies": "enemy",
}
def normalize_plural(phrase):
    """Return *phrase* with plural words reduced to a singular form.

    Irregular plurals listed in PLURAL_EXCEPTIONS are substituted first
    (plain substring replacement). After that, every whitespace-separated
    word ending in a single "s" loses that "s"; words ending in "ss"
    (e.g. "class", "boss") are left alone. Words are re-joined with single
    spaces.
    """
    for irregular, base_form in PLURAL_EXCEPTIONS.items():
        phrase = phrase.replace(irregular, base_form)

    return ' '.join(
        token[:-1] if token.endswith('s') and not token.endswith('ss') else token
        for token in phrase.split()
    )
def extract_phrases(caption, debug=False):
    """Map every topic in TOPIC_KEYWORDS to the caption phrase that mentions it.

    The caption is split on "." into stripped, non-empty phrases. Topics are
    visited in TOPIC_KEYWORDS order, and each topic takes the first phrase
    that contains it and has not already been claimed by an earlier topic.
    A phrase starting with "no " counts as absence: the topic maps to None
    and the phrase is NOT marked as claimed.

    Args:
        caption (str): Period-separated caption text.
        debug (bool): If True, print how each topic was resolved.

    Returns:
        dict: topic -> matching phrase, or None when absent/negated.
    """
    sentences = [piece.strip() for piece in caption.split('.') if piece.strip()]
    claimed = set()  # phrases already assigned to an earlier (longer) topic
    topic_to_phrase = {}

    for topic in TOPIC_KEYWORDS:
        # First unclaimed phrase mentioning this topic, if any
        phrase = next(
            (s for s in sentences if topic in s and s not in claimed),
            None,
        )

        if phrase is None:
            topic_to_phrase[topic] = None
            if debug:
                print(f"[Extract] Topic '{topic}': no phrase found")
            continue

        if phrase.lower().startswith("no "):
            # "no ..." phrases are equivalent to the topic being absent
            topic_to_phrase[topic] = None
            if debug:
                print(f"[Extract] Topic '{topic}': detected 'no ...', treating as None")
            continue

        topic_to_phrase[topic] = phrase
        claimed.add(phrase)
        if debug:
            print(f"[Extract] Topic '{topic}': found phrase '{phrase}'")

    return topic_to_phrase
def quantity_score(phrase1, phrase2, debug=False):
    """Score how close the quantity words of two phrases are.

    Each phrase is searched (by substring) for the first term of
    QUANTITY_TERMS it contains. When both phrases have one, the score is
    1.0 minus the index distance divided by the largest possible distance,
    so identical quantities give 1.0 and the two extremes give 0.0. When
    either phrase has no quantity term the score is a flat 0.1.

    Args:
        phrase1 (str): First phrase.
        phrase2 (str): Second phrase.
        debug (bool): If True, print the intermediate values.

    Returns:
        float: Similarity score in [0.0, 1.0].
    """
    def first_quantity_in(text):
        return next((term for term in QUANTITY_TERMS if term in text), None)

    qty1, qty2 = first_quantity_in(phrase1), first_quantity_in(phrase2)
    if debug:
        print(f"[Quantity] Comparing quantities: '{qty1}' vs. '{qty2}'")

    if not (qty1 and qty2):
        if debug:
            print("[Quantity] At least one quantity missing, assigning partial score 0.1")
        return 0.1

    idx1, idx2 = QUANTITY_TERMS.index(qty1), QUANTITY_TERMS.index(qty2)
    diff = abs(idx1 - idx2)
    max_diff = len(QUANTITY_TERMS) - 1
    score = 1.0 - (diff / max_diff)
    if debug:
        print(f"[Quantity] Quantity indices: {idx1} vs. {idx2}, diff: {diff}, score: {score:.2f}")
    return score
def compare_captions(correct_caption, generated_caption, debug=False, return_matches=False):
    """Score how well a generated caption agrees with a reference caption.

    Both captions are broken into per-topic phrases with extract_phrases.
    Each topic in TOPIC_KEYWORDS then contributes:
      *  1.0  when neither caption mentions the topic,
      * -1.0  when exactly one caption mentions it,
      *  1.0  when both phrases are equal after plural normalization,
      *  quantity_score(...) when both phrases contain a quantity term,
      *  0.1  otherwise (same topic, nothing else comparable).
    The sum is divided by the number of topics.

    Args:
        correct_caption (str): Reference caption.
        generated_caption (str): Caption to evaluate.
        debug (bool): If True, print a trace of the comparison.
        return_matches (bool): If True, also return the matched phrases.

    Returns:
        float, or (float, list, list, list) when return_matches is True:
        the final score, followed by exact matches, partial matches and
        excess phrases (all taken from the generated caption).
    """
    reference_by_topic = extract_phrases(correct_caption, debug=debug)
    candidate_by_topic = extract_phrases(generated_caption, debug=debug)

    exact_matches, partial_matches, excess_phrases = [], [], []
    num_topics = len(TOPIC_KEYWORDS)
    total_score = 0.0

    def mentions_quantity(text):
        return any(term in text for term in QUANTITY_TERMS)

    if debug:
        print("\n--- Starting Topic Comparison ---\n")

    for topic in TOPIC_KEYWORDS:
        correct = reference_by_topic[topic]
        generated = candidate_by_topic[topic]
        if debug:
            print(f"[Topic: {topic}] Correct: {correct} | Generated: {generated}")

        # Each branch decides this topic's contribution and its debug note.
        if correct is None and generated is None:
            delta = 1.0
            note = "Both None — full score: 1.0\n"
        elif correct is None or generated is None:
            delta = -1.0
            if generated is not None:
                # Present only in the generated caption: an excess phrase
                excess_phrases.append(generated)
            note = "One is None — penalty: -1.0\n"
        else:
            # Compare with pluralization normalized away
            norm_correct = normalize_plural(correct)
            norm_generated = normalize_plural(generated)
            if debug:
                print(f"[Topic: {topic}] Normalized: Correct: '{norm_correct}' | Generated: '{norm_generated}'")

            if norm_correct == norm_generated:
                delta = 1.0
                exact_matches.append(generated)
                note = "Exact match — score: 1.0\n"
            elif mentions_quantity(norm_correct) and mentions_quantity(norm_generated):
                delta = quantity_score(norm_correct, norm_generated, debug=debug)
                partial_matches.append(generated)
                note = f"Quantity-based partial score: {delta:.2f}\n"
            else:
                delta = 0.1
                partial_matches.append(generated)
                note = "Partial match (topic overlap) — score: 0.1\n"

        total_score += delta
        if debug:
            print(f"[Topic: {topic}] {note}")
            print(f"[Topic: {topic}] Current total score: {total_score:.4f}\n")

    if debug:
        print("total_score before normalization:", total_score)
        print(f"Number of topics: {num_topics}")

    final_score = total_score / num_topics
    if debug:
        print(f"--- Final score: {final_score:.4f} ---\n")

    if not return_matches:
        return final_score
    return final_score, exact_matches, partial_matches, excess_phrases
def process_scene_segments(scene, segment_width, prompt, id_to_char, char_to_id, tile_descriptors, describe_locations, describe_absence, verbose=False):
    """
    Process a scene by partitioning it into segments, assigning captions, and computing comparison scores.

    The scene is cut column-wise into chunks of ``segment_width`` (the last
    chunk may be narrower). Each chunk is captioned with ``assign_caption``
    and scored against ``prompt`` with ``compare_captions``.

    Args:
        scene (list): The scene to process, represented as a 2D list (rows of tiles).
        segment_width (int): The width of each segment.
        prompt (str): The prompt to compare captions against.
        id_to_char (dict): Mapping from tile IDs to characters.
        char_to_id (dict): Mapping from characters to tile IDs.
        tile_descriptors (dict): Descriptions of individual tile types.
        describe_locations (bool): Whether to include location descriptions in captions.
        describe_absence (bool): Whether to indicate absence of items in captions.
        verbose (bool): If True, print captions and scores for each segment.
    Returns:
        tuple: A tuple containing the average comparison score, captions for each segment, and scores for each segment.
        An empty scene (no rows, or rows without columns) yields ``(0, [], [])``.
    """
    # A scene with no rows has no first row to measure; treat it as zero
    # columns wide so it produces no segments instead of raising IndexError.
    scene_width = len(scene[0]) if len(scene) > 0 else 0

    # Partition the scene into segments of the specified width
    segments = [
        [row[start:start + segment_width] for row in scene]
        for start in range(0, scene_width, segment_width)
    ]

    # Assign captions and compute scores for each segment
    segment_scores = []
    segment_captions = []
    for idx, segment in enumerate(segments):
        segment_caption = assign_caption(segment, id_to_char, char_to_id, tile_descriptors, describe_locations, describe_absence)
        segment_score = compare_captions(prompt, segment_caption)
        segment_scores.append(segment_score)
        segment_captions.append(segment_caption)
        if verbose:
            print(f"Segment {idx + 1} caption: {segment_caption}")
            print(f"Segment {idx + 1} comparison score: {segment_score}")

    # Compute the average comparison score (0 when there were no segments)
    average_score = sum(segment_scores) / len(segment_scores) if segment_scores else 0
    if verbose:
        print(f"Average comparison score across all segments: {average_score}")
    return average_score, segment_captions, segment_scores
if __name__ == '__main__':
    # Manual smoke test: score one hand-written caption pair with full debug
    # tracing so the per-topic decisions can be inspected.
    expected_caption = "floor with one gap. two enemies. one platform. one tower."
    produced_caption = "giant gap with one chunk of floor. two enemies. one platform. one tower."
    score = compare_captions(expected_caption, produced_caption, debug=True)
    print(f"Should be: {expected_caption}")
    print(f"  but was: {produced_caption}")
    print(f"Score: {score}")

common_settings.py ADDED Viewed

	@@ -0,0 +1,18 @@

# Sampling defaults. NUM_INFERENCE_STEPS is the default number of scheduler
# steps used by the diffusion pipelines; GUIDANCE_SCALE is presumably the
# default classifier-free guidance weight (confirm against its users).
NUM_INFERENCE_STEPS = 30
GUIDANCE_SCALE = 7.5

# Mario scenes: default height/width (in tiles) of a generated sample.
# The *_TILE_PIXEL_DIM / *_TILE_COUNT names suggest pixels per tile side and
# number of distinct tile types, but neither is used in this file, so
# confirm against callers.
MARIO_HEIGHT = 16
MARIO_WIDTH = 16
MARIO_TILE_PIXEL_DIM = 16
MARIO_TILE_COUNT = 13

# "LR" scenes (the game this abbreviates is not stated here).
LR_HEIGHT = 32
LR_WIDTH = 32
LR_TILE_PIXEL_DIM = 8
LR_TILE_COUNT = 8

# Mega Man scenes.
MEGAMAN_HEIGHT = 14
MEGAMAN_WIDTH = 16

general_training_helper.py ADDED Viewed

	@@ -0,0 +1,172 @@

+from torch.utils.data import DataLoader
+from level_dataset import LevelDataset
+import random
+from plotter import Plotter
+from datetime import datetime
+import os
+import threading
+import json
+import torch.nn.functional as F
+import torch
def create_dataloaders(json_path, val_json, tokenizer, data_mode, augment, num_tiles,
                       negative_prompt_training, block_embeddings, batch_size):
    """Build the training DataLoader and, optionally, a validation DataLoader.

    The training set is read from ``json_path`` with shuffling enabled and
    the caller's ``augment`` flag; its loader shuffles and drops the last
    incomplete batch. When ``val_json`` is given, a validation set is built
    with shuffling and augmentation off, and its loader keeps every sample.
    Both loaders use 4 persistent worker processes.

    Returns:
        tuple: ``(train_dataloader, val_dataloader)``; the second item is
        None when ``val_json`` is None.
    """
    # Options that are identical for the training and validation datasets
    shared_dataset_options = dict(
        tokenizer=tokenizer,
        mode=data_mode,
        num_tiles=num_tiles,
        negative_captions=negative_prompt_training,
        block_embeddings=block_embeddings,
    )

    train_dataset = LevelDataset(
        json_path=json_path,
        shuffle=True,
        augment=augment,
        **shared_dataset_options,
    )
    val_dataset = (
        LevelDataset(
            json_path=val_json,
            shuffle=False,
            augment=False,
            **shared_dataset_options,
        )
        if val_json is not None
        else None
    )

    # Options that are identical for both loaders
    shared_loader_options = dict(
        batch_size=batch_size,
        num_workers=4,
        persistent_workers=True,
    )

    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        drop_last=True,
        **shared_loader_options,
    )
    val_dataloader = (
        None
        if val_dataset is None
        else DataLoader(
            val_dataset,
            shuffle=False,
            drop_last=False,
            **shared_loader_options,
        )
    )
    return train_dataloader, val_dataloader
def get_random_training_samples(train_dataloader, negative_prompt_training, output_dir = None):
    """Pick four random training items and report their captions.

    Four indices are drawn (with replacement) from the loader's dataset.
    Element ``[1]`` of each item is used as its caption and, when
    ``negative_prompt_training`` is true, element ``[2]`` as its negative
    caption. The captions are printed and, if ``output_dir`` is given, also
    written to ``sample_captions.txt`` inside it.

    Returns:
        tuple: ``(sample_captions, sample_negative_captions)``. The second
        item is a list when negative prompts are enabled and the empty
        string ``""`` otherwise.
    """
    dataset = train_dataloader.dataset
    last_index = len(dataset) - 1

    # Sample four random captions from the dataset
    chosen = [random.randint(0, last_index) for _ in range(4)]
    sample_captions = [dataset[i][1] for i in chosen]
    print("Sample captions:")
    for text in sample_captions:
        print(text)

    sample_negative_captions = ""
    if negative_prompt_training:
        sample_negative_captions = [dataset[i][2] for i in chosen]
        print("Sample negative captions:")
        for text in sample_negative_captions:
            print(f"  NEG: {text}")

    if output_dir is None:
        return sample_captions, sample_negative_captions

    # Write captions to a file
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, "sample_captions.txt")
    lines = ["Sample captions:\n"]
    lines.extend(str(text) + "\n" for text in sample_captions)
    if negative_prompt_training:
        lines.append("\nSample negative captions:\n")
        lines.extend(str(text) + "\n" for text in sample_negative_captions)
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
    print(f"Sample captions written to {out_path}")

    return sample_captions, sample_negative_captions
def start_plotter(log_file, output_dir, left_key, right_key, left_label, right_label, png_name):
    """Start a background Plotter thread that redraws the log every 5 seconds.

    The PNG file name is ``<png_name>_<YYYYmmdd-HHMMSS>.png``. The thread is
    a daemon, so it will not keep the process alive on its own.

    NOTE(review): ``output_dir`` is only used in the printed message; the
    Plotter itself receives just the bare file name.

    Returns:
        tuple: ``(plotter, plot_thread)``, to be handed to kill_plotter later.
    """
    timestamp = datetime.now().strftime(r'%Y%m%d-%H%M%S')
    png_filename = f'{png_name}_{timestamp}.png'

    plotter = Plotter(
        log_file,
        update_interval=5.0,
        left_key=left_key,
        right_key=right_key,
        left_label=left_label,
        right_label=right_label,
        output_png=png_filename,
    )

    plot_thread = threading.Thread(target=plotter.start_plotting, daemon=True)
    plot_thread.start()

    print(f"{png_name} plotting enabled. Progress will be saved to {os.path.join(output_dir, png_filename)}")
    return plotter, plot_thread
def kill_plotter(plotter, plot_thread):
    """Stop a running plot thread and wait up to 5 seconds for it to finish.

    Does nothing when ``plot_thread`` is falsy or no longer alive. Prints a
    warning if the thread is still alive after the join timeout.
    """
    if not plot_thread or not plot_thread.is_alive():
        return

    plotter.stop_plotting()
    plot_thread.join(timeout=5.0)
    if plot_thread.is_alive():
        print("Warning: Plot thread did not terminate properly")
def load_config_from_json(config_path):
    """Load hyperparameters from a JSON config file.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, matching how this module writes its own text files. The loaded
    key/value pairs are echoed to stdout for verification.

    Args:
        config_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (expected to be a dict of hyperparameters).

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        Both are reported on stdout and then re-raised unchanged.
    """
    try:
        # Only the read/parse can raise the errors handled below
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError) as e:
        print(f"Error loading config file: {e}")
        raise

    print(f"Configuration loaded from {config_path}")
    # Print the loaded config for verification
    print("Loaded hyperparameters:")
    for key, value in config.items():
        print(f"  {key}: {value}")
    return config
def update_args_from_config(args, config):
    """Overwrite attributes of ``args`` with same-named entries of ``config``.

    Only keys that already exist as attributes on ``args`` are applied;
    unknown config keys are silently ignored. ``args`` is modified in place
    and also returned.
    """
    for name in config:
        if not hasattr(args, name):
            continue  # not a recognized argument; leave args untouched
        setattr(args, name, config[name])
    return args
def get_scene_from_embeddings(image, block_embeddings):
    """Convert an embedding-space sample into per-tile probabilities.

    Code copied over from level_dataset, should give limited support for
    block embeddings. Every spatial position of ``image`` is compared to
    every tile embedding by cosine similarity, and a softmax over those
    similarities gives a distribution over tiles for that position.

    Args:
        image: Tensor of shape [batch_size, embedding_dim, height, width].
        block_embeddings: Tensor of shape [num_tiles, embedding_dim].

    Returns:
        CPU tensor of shape [batch_size, num_tiles, height, width] holding
        softmax probabilities (not hard tile indices); take ``argmax`` over
        dim 1 to obtain the most similar tile per position.
    """
    batch_size, embedding_dim, height, width = image.shape
    # Derive the tile count from the embedding table instead of assuming 13
    num_tiles = block_embeddings.shape[0]

    # Reshape sample to [batch_size * height * width, embedding_dim]
    flat_samples = image.permute(0, 2, 3, 1).reshape(-1, embedding_dim)

    # Normalize vectors for cosine similarity. Both operands are moved to the
    # CPU so the matmul below never mixes devices.
    flat_samples = F.normalize(flat_samples, p=2, dim=1).cpu()
    tile_directions = F.normalize(block_embeddings, p=2, dim=1).cpu()

    # Cosine similarity between each position and all tile embeddings
    similarities = torch.matmul(flat_samples, tile_directions.t())

    # Distribution over tiles for every position
    probabilities = torch.softmax(similarities, dim=1)

    # Reshape back to [batch_size, height, width, num_tiles], then channels-first
    probabilities = probabilities.reshape(batch_size, height, width, num_tiles)
    probabilities = probabilities.permute(0, 3, 1, 2)
    return probabilities.detach().cpu()

latent_diffusion_pipeline.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from diffusers import DDPMPipeline
+import torch
+import torch.nn.functional as F
+from typing import Optional, Union, List, Tuple
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.ddpm.pipeline_ddpm import ImagePipelineOutput
+import common_settings as common_settings
+import os
+import json
+from general_training_helper import get_scene_from_embeddings
class UnconditionalDDPMPipeline(DDPMPipeline):
    """DDPM pipeline that samples tile scenes without text conditioning.

    On top of ``DDPMPipeline`` it optionally carries a ``block_embeddings``
    tensor (saved/loaded next to the model as ``block_embeddings.pt``) and
    optional per-sprite scaling factors applied to the final sample.
    """

    def __init__(self, unet, scheduler, block_embeddings=None):
        super().__init__(unet, scheduler)
        self.block_embeddings = block_embeddings

    def save_pretrained(self, save_directory, **kwargs):
        """Save the pipeline and, if present, the block embeddings tensor.

        Extra keyword arguments are forwarded to the base class
        ``save_pretrained`` so its options remain usable.
        """
        os.makedirs(save_directory, exist_ok=True)
        super().save_pretrained(save_directory, **kwargs)
        # Save block_embeddings tensor if it exists
        if self.block_embeddings is not None:
            torch.save(self.block_embeddings, os.path.join(save_directory, "block_embeddings.pt"))

    @classmethod
    def from_pretrained(cls, pretrained_model_path, **kwargs):
        """Load the pipeline and attach ``block_embeddings`` when the file exists.

        ``block_embeddings`` is set to None if ``block_embeddings.pt`` is not
        found under ``pretrained_model_path``.
        """
        pipeline = super().from_pretrained(pretrained_model_path, **kwargs)
        # Load block_embeddings tensor if it exists
        block_embeds_path = os.path.join(pretrained_model_path, "block_embeddings.pt")
        if os.path.exists(block_embeds_path):
            # NOTE(security): torch.load unpickles the file; only load model
            # directories from trusted sources.
            pipeline.block_embeddings = torch.load(block_embeds_path, map_location="cpu")
        else:
            pipeline.block_embeddings = None
        return pipeline

    def give_sprite_scaling_factors(self, sprite_scaling_factors):
        """
        Set the sprite scaling factors for the pipeline.
        This is used to apply per-sprite temperature scaling during inference.
        """
        self.sprite_scaling_factors = sprite_scaling_factors

    def __call__(
        self,
        batch_size: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        num_inference_steps: int = common_settings.NUM_INFERENCE_STEPS,
        output_type: Optional[str] = "tensor",
        return_dict: bool = True,
        height: int = common_settings.MARIO_HEIGHT, width: int = common_settings.MARIO_WIDTH,
        latents: Optional[torch.FloatTensor] = None,
        show_progress_bar=True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        """Run the reverse diffusion process and return per-tile probabilities.

        Args:
            batch_size: Number of samples to draw when ``latents`` is None.
            generator: One generator, or a list of generators, used for the
                initial noise and passed to every scheduler step.
            num_inference_steps: Number of scheduler timesteps.
            output_type: Accepted for interface compatibility; not read here.
            return_dict: If False, return a one-element tuple instead of an
                ``ImagePipelineOutput``.
            height, width: Spatial size of the sample when ``latents`` is None.
            latents: Optional starting tensor; moved to the pipeline device
                and used instead of fresh noise.
            show_progress_bar: Wrap the timestep loop in a progress bar.

        Returns:
            ``ImagePipelineOutput(images=...)`` or ``(images,)``, where
            ``images`` is a CPU tensor of softmax probabilities over dim 1.
        """
        self.unet.eval()
        with torch.no_grad():
            if latents is not None:
                image = latents.to(self.device)
            else:
                image_shape = (
                    batch_size,
                    self.unet.config.in_channels,
                    height,
                    width
                )
                # randn_tensor (unlike torch.randn) accepts a list of
                # generators and a generator living on a different device.
                image = randn_tensor(image_shape, generator=generator, device=self.device)

            self.scheduler.set_timesteps(num_inference_steps)
            iterator = self.progress_bar(self.scheduler.timesteps) if show_progress_bar else self.scheduler.timesteps
            for t in iterator:
                model_output = self.unet(image, t).sample
                image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample

            # Apply per-sprite temperature scaling if enabled. The attribute
            # only exists after give_sprite_scaling_factors has been called.
            scaling = getattr(self, "sprite_scaling_factors", None)
            if scaling is not None:
                image = image / scaling.view(1, -1, 1, 1)

            if self.block_embeddings is not None:
                # Embedding-space sample: map to tile probabilities by similarity
                image = get_scene_from_embeddings(image, self.block_embeddings)
            else:
                # Channel-per-tile sample: softmax across the tile channels
                image = F.softmax(image, dim=1)
                image = image.detach().cpu()

            if not return_dict:
                return (image,)
            return ImagePipelineOutput(images=image)

    def print_unet_architecture(self):
        """Prints the architecture of the UNet model."""
        print(self.unet)

model_index.json CHANGED Viewed

@@ -6,7 +6,7 @@
     "DDPMScheduler"
   ],
   "text_encoder": [
-    "models.text_model",
     "TransformerModel"
   ],
   "tokenizer": [

     "DDPMScheduler"
   ],
   "text_encoder": [
+    "text_model",
     "TransformerModel"
   ],
   "tokenizer": [

naming_conventions.py ADDED Viewed

	@@ -0,0 +1,29 @@

# (internal model/run name, display name) pairs. The list order is also the
# display order returned by get_model_name_map_and_order.
model_name_map = [
    # Conditional models with the "MLM" text encoder
    ("Mar1and2-conditional-regular", "MLM-regular"),
    ("Mar1and2-conditional-absence", "MLM-absence"),
    ("Mar1and2-conditional-negative", "MLM-negative"),

    # Conditional models with MiniLM ("split" runs are shown as "multiple")
    ("Mar1and2-conditional-MiniLM-regular", "MiniLM-single-regular"),
    ("Mar1and2-conditional-MiniLM-absence", "MiniLM-single-absence"),
    ("Mar1and2-conditional-MiniLM-negative", "MiniLM-single-negative"),
    ("Mar1and2-conditional-MiniLMsplit-regular", "MiniLM-multiple-regular"),
    ("Mar1and2-conditional-MiniLMsplit-absence", "MiniLM-multiple-absence"),
    ("Mar1and2-conditional-MiniLMsplit-negative", "MiniLM-multiple-negative"),

    # Conditional models with GTE ("split" runs are shown as "multiple")
    ("Mar1and2-conditional-GTE-regular", "GTE-single-regular"),
    ("Mar1and2-conditional-GTE-absence", "GTE-single-absence"),
    ("Mar1and2-conditional-GTE-negative", "GTE-single-negative"),
    ("Mar1and2-conditional-GTEsplit-regular", "GTE-multiple-regular"),
    ("Mar1and2-conditional-GTEsplit-absence", "GTE-multiple-absence"),
    ("Mar1and2-conditional-GTEsplit-negative", "GTE-multiple-negative"),

    # FDM models
    ("Mar1and2-fdm-MiniLM-regular", "FDM-MiniLM-regular"),
    ("Mar1and2-fdm-MiniLM-absence", "FDM-MiniLM-absence"),
    ("Mar1and2-fdm-GTE-regular", "FDM-GTE-regular"),
    ("Mar1and2-fdm-GTE-absence", "FDM-GTE-absence"),

    # Baselines
    ("Mar1and2-wgan", "WGAN"),
    ("Mar1and2-unconditional", "Unconditional"),
    ("MarioGPT_metrics", "MarioGPT"),
]
def get_model_name_map_and_order():
    """Return the name mapping as a dict plus the display names in list order.

    Returns:
        tuple: ``(mapping, order)`` where ``mapping`` maps each internal model
        name to its display name and ``order`` lists the display names in the
        order they appear in ``model_name_map``.
    """
    display_order = [display_name for _, display_name in model_name_map]
    return dict(model_name_map), display_order

pipeline_loader.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from text_diffusion_pipeline import TextConditionalDDPMPipeline
+from latent_diffusion_pipeline import UnconditionalDDPMPipeline
+import os
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
def get_pipeline(model_path):
    """Load the diffusion pipeline that matches ``model_path``.

    Local directory: a ``unet`` subdirectory is required; if a
    ``text_encoder`` subdirectory is also present the text-conditional
    pipeline is loaded, otherwise the unconditional one.

    Anything else is treated as a Hugging Face Hub model ID: the pipeline
    config is fetched and the text-conditional pipeline is chosen when it
    lists a ``text_encoder`` component, falling back to the unconditional
    pipeline otherwise (including when the config cannot be fetched).

    Args:
        model_path: Local model directory or Hub model ID.

    Returns:
        The loaded pipeline object.

    Raises:
        ValueError: If ``model_path`` is a local directory with no ``unet``
            subdirectory (previously this surfaced as UnboundLocalError).
    """
    if os.path.isdir(model_path):
        # Diffusion models always ship a unet; without one we cannot tell
        # what to load, so fail with an explicit message.
        if not os.path.exists(os.path.join(model_path, "unet")):
            raise ValueError(
                f"No 'unet' subdirectory found in '{model_path}'; "
                "cannot determine which diffusion pipeline to load."
            )
        if os.path.exists(os.path.join(model_path, "text_encoder")):
            # If it has a text encoder and a unet, it's text conditional diffusion
            return TextConditionalDDPMPipeline.from_pretrained(model_path)
        # If it has no text encoder, use the unconditional diffusion model
        return UnconditionalDDPMPipeline.from_pretrained(model_path)

    # Assume it's a Hugging Face Hub model ID and inspect its config to
    # decide whether it is text-conditional.
    try:
        config = DiffusionPipeline.load_config(model_path)
        # load_config returns the config dict by default; tolerate a tuple
        # in case extra return values were requested by a wrapper/version.
        if isinstance(config, tuple):
            config = config[0]
    except Exception:
        # Best effort: if the config cannot be fetched, fall back to the
        # unconditional pipeline below.
        config = {}

    # Pipeline components are top-level keys of model_index.json; the nested
    # "components" lookup is kept for backward compatibility.
    is_text_conditional = (
        "text_encoder" in config
        or "text_encoder" in str(config.get("components", {}))
    )

    # NOTE(review): the custom_pipeline strings still use a "models." prefix;
    # confirm they match where the pipeline files actually live.
    if is_text_conditional:
        # Use the local pipeline file for custom_pipeline
        pipe = DiffusionPipeline.from_pretrained(
            model_path,
            custom_pipeline="models.text_diffusion_pipeline.TextConditionalDDPMPipeline",
            trust_remote_code=True,
        )
    else:
        # Fallback: try unconditional
        pipe = DiffusionPipeline.from_pretrained(
            model_path,
            custom_pipeline="models.latent_diffusion_pipeline.UnconditionalDDPMPipeline",
            trust_remote_code=True,
        )
    return pipe

plotter.py ADDED Viewed

	@@ -0,0 +1,173 @@

+# Track changes in loss and learning rate during execution
+import argparse
+import matplotlib
+import matplotlib.pyplot as plt
+import os
+import time
+import json
+import tempfile
+import shutil
+from pathlib import Path
def parse_args():
    """Parse the command-line options of the standalone plotting script.

    Returns:
        argparse.Namespace with ``log_file``, ``left_key``, ``right_key``,
        ``left_label``, ``right_label``, ``output_png``, ``update_interval``
        (float) and ``start_point``.
    """
    parser = argparse.ArgumentParser(description="Plot two logged training metrics (e.g. loss and learning rate) against the epoch")
    # Input / series selection
    parser.add_argument("--log_file", type=str, default=None, help="The filepath of the JSON-lines log file to get the data from")
    parser.add_argument("--left_key", type=str, default=None, help="The key for the left y-axis")
    parser.add_argument("--right_key", type=str, default=None, help="The key for the right y-axis")
    parser.add_argument("--left_label", type=str, default=None, help="The label for the left y-axis")
    parser.add_argument("--right_label", type=str, default=None, help="The label for the right y-axis")
    parser.add_argument("--output_png", type=str, default="output.png", help="The output png file")
    # float so the type agrees with the 1.0 default and fractional values are accepted
    parser.add_argument("--update_interval", type=float, default=1.0, help="The update interval in seconds (a single command-line run draws the plot once)")
    parser.add_argument("--start_point", type=int, default=None, help="The start point (epoch) for the plot")
    return parser.parse_args()
def main():
    """Command-line entry point: parse the options and draw the plot once."""
    cli = parse_args()
    general_update_plot(
        cli.log_file,
        cli.left_key,
        cli.right_key,
        cli.left_label,
        cli.right_label,
        cli.output_png,
        update_interval=cli.update_interval,
        startPoint=cli.start_point,
    )
def general_update_plot(log_file, left_key, right_key, left_label, right_label, output_png, update_interval=1.0, startPoint=None):
    """Read a JSON-lines log and (re)draw the progress plot as a PNG.

    Every non-blank line of ``log_file`` is parsed as a JSON object. The
    ``left_key`` series is drawn for all entries (missing values count as 0);
    the ``right_key`` series only uses entries that contain that key. Both
    series share one axis. Nothing is written when the log is missing, empty,
    or has no entry at or after ``startPoint``.

    Args:
        log_file: Path of the JSON-lines log.
        left_key, right_key: Keys of the two series to plot.
        left_label, right_label: Legend labels (left_label is also the y label).
        output_png: Output file; a bare file name is placed next to the log.
        update_interval: Accepted for interface compatibility; not read here.
        startPoint: If given, only epochs >= startPoint are plotted.
    """
    log_dir = os.path.dirname(log_file)

    # Create the figure here and make sure it is always closed again
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)
    try:
        if not os.path.exists(log_file):
            return

        with open(log_file, 'r') as f:
            records = [json.loads(line) for line in f if line.strip()]
        if not records:
            return

        if startPoint is not None:
            records = [rec for rec in records if rec.get('epoch', 0) >= startPoint]
            if not records:
                return

        epochs = [rec.get('epoch', 0) for rec in records]
        left_values = [rec.get(left_key, 0) for rec in records]

        # Right series (e.g. lr): only points whose entry has right_key
        right_points = [
            (rec.get('epoch', 0), rec.get(right_key))
            for rec in records
            if right_key in rec
        ]
        right_epochs, right_values = zip(*right_points) if right_points else ([], [])

        # Start from a clean axis, then draw both metrics on it
        ax.clear()
        ax.plot(epochs, left_values, 'b-', label=left_label)
        if right_epochs:
            ax.plot(right_epochs, right_values, 'r-', label=right_label)

        ax.set_xlabel('Epoch')
        ax.set_ylabel(left_label)
        ax.set_title('Training Progress')
        ax.legend(loc='upper left')

        # Limit x-axis to startPoint if provided
        if startPoint is not None:
            ax.set_xlim(left=startPoint)

        fig.tight_layout()

        # Paths that already name a directory are used as-is; a bare file
        # name is saved alongside the log file.
        has_directory = os.path.isabs(output_png) or os.path.dirname(output_png)
        output_path = output_png if has_directory else os.path.join(log_dir, output_png)
        save_figure_safely(fig, output_path)
    finally:
        plt.close(fig)  # Ensure figure is closed even if an error occurs
def save_figure_safely(fig, output_path):
    """Save figure to a temporary file first, then move it to the final location.

    The figure is rendered into a fresh temporary ``.png`` file, the output
    directory is created if needed, and the temporary file is then moved to
    ``output_path`` (falling back to copy + delete if the move fails). On any
    error the temporary file is removed and the exception is re-raised.
    """
    final_path = str(Path(output_path))  # normalize to a plain string path

    # Reserve a temporary .png file; it is closed right away so that
    # savefig can write to it by name.
    staging_file = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
    staging_file.close()
    staged_path = staging_file.name

    try:
        fig.savefig(staged_path)

        # Create output directory if it doesn't exist
        target_dir = os.path.dirname(os.path.abspath(final_path))
        os.makedirs(target_dir, exist_ok=True)

        # Move into place; if that fails, copy and then delete the original
        try:
            shutil.move(staged_path, final_path)
        except OSError:
            shutil.copy2(staged_path, final_path)
            os.unlink(staged_path)
    except Exception:
        # Clean up temporary file if anything goes wrong
        if os.path.exists(staged_path):
            os.unlink(staged_path)
        raise
class Plotter:
    """Periodically redraws a training-progress plot from a JSON-lines log.

    ``start_plotting`` is meant to run in a background thread; it redraws the
    plot every ``update_interval`` seconds until ``stop_plotting`` is called.
    The object can also be used as a context manager, which stops plotting
    on exit.
    """

    def __init__(self, log_file, update_interval=1.0, left_key='loss', right_key='lr',
                 left_label='Loss', right_label='Learning Rate', output_png='training_progress.png'):
        self.log_dir = os.path.dirname(log_file)
        self.log_file = log_file
        self.update_interval = update_interval
        self.running = True
        self.output_png = output_png
        self.left_key = left_key
        self.right_key = right_key
        self.left_label = left_label
        self.right_label = right_label
        # Non-interactive backend: figures are only ever written to files
        matplotlib.use('Agg')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_plotting()

    def __del__(self):
        self.stop_plotting()

    def update_plot(self):
        """Redraw the plot once from the current contents of the log file."""
        general_update_plot(self.log_file, self.left_key, self.right_key,
                          self.left_label, self.right_label, self.output_png,
                          update_interval=self.update_interval)

    def _sleep_while_running(self, duration, step=0.1):
        """Sleep up to ``duration`` seconds, waking early once stopped."""
        deadline = time.monotonic() + duration
        while self.running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(step, remaining))

    def start_plotting(self):
        """Redraw the plot every ``update_interval`` seconds until stopped."""
        print("Starting plotting in background")
        while self.running:
            self.update_plot()
            # Sleep in short slices so a stop request is honored promptly
            # instead of only after a full update interval.
            self._sleep_while_running(self.update_interval)

    def stop_plotting(self):
        """Stop the plotting loop and draw one final plot.

        Safe to call repeatedly (explicit stop, ``__exit__`` and ``__del__``
        may all call it): only the first call on a running plotter redraws
        and prints. Also a no-op on an instance whose ``__init__`` never got
        as far as setting ``running``.
        """
        if not getattr(self, 'running', False):
            return
        self.running = False
        self.update_plot()
        print("Plotting stopped")
# Run the command-line plotter only when executed as a script, not on import.
if __name__ == "__main__":
    main()

sampler.py ADDED Viewed

	@@ -0,0 +1,473 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import os
+import subprocess
+import tempfile
+import numpy as np
+import torch
+from PIL.Image import Image
+from tqdm import tqdm
+from transformers import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper
+from mario_gpt.lm.base import BaseMarioLM
+from mario_gpt.prompter import Prompter
+from mario_gpt.simulator import Simulator
+from mario_gpt.utils import (
+    convert_level_to_png,
+    load_level,
+    save_level,
+    trim_level,
+    view_level,
+)
def scene_to_ascii(scene, id_to_char, shorten: bool = True) -> List[str]:
    """
    Convert a scene (JSON scene data as a list of lists of ints) into a list
    of ASCII strings, one per row, using the id_to_char mapping.

    Args:
        scene: List[List[int]] - 2D array of tile IDs
        id_to_char: Dict[int, str] - mapping from tile ID to ASCII character
        shorten: bool - If True and the scene has more than 15 rows, only the
                        last 15 rows are kept, so that A* Mario (for SNES
                        graphics) can run without glitching
    Returns:
        List[str]: List of strings, each representing a row in ASCII
    """
    rows = scene[-15:] if shorten and len(scene) > 15 else scene
    return ["".join(map(id_to_char.__getitem__, row)) for row in rows]
+@dataclass
+class SampleOutput:
+    level: Optional[List[str]]
+    prompt: Optional[str] = None
+    img: Optional[Image] = None
+    sample_predictions_str: Optional[List[str]] = None
+    sample_predictions_img: Optional[Image] = None
+    level_tensor: Optional[torch.Tensor] = None
+    sample_predictions_tensor: Optional[torch.Tensor] = None
+    # Uses MarioEval graphics for rendering levels when True
+    use_snes_graphics: bool = False
+    @classmethod
+    def create(
+        cls,
+        level_tensor: torch.Tensor,
+        sample_predictions_tensor: torch.Tensor,
+        tokenizer,
+        prompter: Optional[Prompter] = None,
+    ) -> SampleOutput:
+        # batch = 1
+        level = None
+        img = None
+        try:
+            level = view_level(level_tensor, tokenizer)
+            img = convert_level_to_png(level)[0]
+        except Exception as e:
+            print(
+                f"Failed to generate string or image representation for full level! Got error {e}"
+            )
+            level = None
+            img = None
+        try:
+            sample_predictions_str = view_level(sample_predictions_tensor, tokenizer)
+            sample_predictions_img = convert_level_to_png(sample_predictions_str)[0]
+        except Exception as e:
+            print(
+                f"Failed to generate string or image representation for sampled predictions! Got error {e}"
+            )
+            sample_predictions_str = None
+            sample_predictions_img = None
+        prompt = None
+        if prompter is not None:
+            prompt = prompter(level_tensor)[0]
+        return SampleOutput(
+            level,
+            prompt,
+            img,
+            sample_predictions_str,
+            sample_predictions_img,
+            level_tensor,
+            sample_predictions_tensor,
+        )
+    @classmethod
+    def from_level_predictions(
+        cls,
+        level: torch.Tensor,
+        sample_predictions: torch.Tensor,
+        tokenizer,
+        prompter: Optional[Prompter] = None,
+    ) -> Union[SampleOutput, List[SampleOutput]]:
+        level_tensor = trim_level(level).squeeze().detach().cpu()
+        sample_predictions_tensor = (
+            trim_level(sample_predictions).squeeze().detach().cpu()
+        )
+        if len(level_tensor.shape) == 1:
+            return SampleOutput.create(
+                level_tensor, sample_predictions_tensor, tokenizer, prompter
+            )
+        out = []
+        for _level_tensor, _sample_predictions_tensor in zip(
+            level_tensor, sample_predictions_tensor
+        ):
+            sample_output = SampleOutput.create(
+                _level_tensor, _sample_predictions_tensor, tokenizer, prompter
+            )
+            out.append(sample_output)
+        return out
+    def save(self, filename: str) -> str:
+        save_level(self.level, filename)
+    @classmethod
+    def load(cls, filename: str) -> SampleOutput:
+        level = load_level(filename)
+        return SampleOutput(level=level)
+    def play(self, game="mario", level_idx=None, dataset_path=None):
+        """
+        Play the level using the specified game engine.
+        game: "mario" (default) or "loderunner"
+        """
+        if game == "loderunner":
+            import tempfile, json
+            # Convert self.level (list of strings) to Lode Runner JSON format
+            scene = [[c for c in row] for row in self.level]
+            lr_json = [{
+                "scene": scene,
+                "caption": ""
+            }]
+            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
+                json.dump(lr_json, tmp)
+                tmp_path = tmp.name
+            import sys, os
+            #sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+            from LodeRunner.loderunner import main
+            tmp_path = tmp_path if dataset_path is None else dataset_path
+            print(f"Playing Lode Runner level interactively -- {tmp_path}!")
+            main.play_lr_level(tmp_path, level_index=level_idx if level_idx is not None else 1)
+        else:
+            if self.use_snes_graphics:
+                simulator = CustomSimulator(level=self.level, jar_path="MarioEval.jar")
+            else:
+                simulator = CustomSimulator(level=self.level, jar_path="NESMarioEval.jar")
+            simulator.interactive()
+    def run_astar(self, render=True):
+        if self.use_snes_graphics:
+            simulator = CustomSimulator(level=self.level, jar_path="MarioEval.jar")
+        else:
+            simulator = CustomSimulator(level=self.level, jar_path="NESMarioEval.jar")
+        return simulator.astar(render)
+class CustomSimulator:
+    """
+        The classic Mario simulator used by MarioGPT is generally,
+        better, but it doesn't return any information about
+        Mario's performance. The main point of this simulator
+        is that information about the performance of the agent
+        is printed to the console (though I still need a way
+        to caption and return that information)
+    """
+    def __init__(self, level, jar_path="MarioEval.jar"):
+        while len(level) > 15:
+            level.pop(0)
+        # For some reason, my older A* agent
+        # crashes on Mario levels with 16 rows or more
+        self.level = level
+        self.jar_path = jar_path
+    def interactive(self):
+        t = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
+        save_level(self.level, t.name)
+        print(f"Playing level interactively -- {t.name}!")
+        _ = subprocess.run(
+            ["java", "-jar", self.jar_path, "human", t.name, "human"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        t.close()
+        os.unlink(t.name)
+    def astar(self, render: bool = True):
+        t = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
+        save_level(self.level, t.name)
+        print(f"Running Astar agent on level! -- {t.name}")
+        render_str = "human" if render else "norender"
+        result = subprocess.run(
+            ["java", "-jar", self.jar_path, "astar", t.name, render_str],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        t.close()
+        os.unlink(t.name)
+        # Combine stdout and stderr, decode to string, and return
+        output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8")
+        return output
+def save_level(level: List[str], filename: str):
+    concatenated = "\n".join(level)
+    with open(filename, "w") as f:
+        f.write(concatenated)
+    return filename
+class GPTSampler:
+    def __init__(
+        self,
+        mario_lm: BaseMarioLM,
+        temperature: float = 2.0,
+        top_k: int = 16,
+        context_len: int = 700,
+        use_tqdm: bool = False,
+        use_argmax: bool = False,
+    ):
+        self.mario_lm = mario_lm
+        self.temperature = temperature
+        self.top_k = top_k
+        self.context_len = context_len
+        self.use_tqdm = use_tqdm
+        self.use_argmax = use_argmax
+        self.logits_processor = LogitsProcessorList()
+        self.logits_warper = LogitsProcessorList(
+            [
+                TopKLogitsWarper(top_k),  # number of characters
+                TemperatureLogitsWarper(temperature),
+            ]
+        )
+    @property
+    def device(self) -> torch.device:
+        return self.mario_lm.device
+    def step(
+        self,
+        seed: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        with torch.no_grad():
+            attention_mask = torch.ones_like(seed).to(seed.device)
+            input_ids = seed
+            out = self.mario_lm.lm(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                token_type_ids=None,
+            )
+            logits = out.logits.detach()
+            if len(logits.shape) == 2:
+                logits = logits.view(1, 1, -1)
+            next_token_logits = logits[:, -1, :]
+            if self.use_argmax:
+                next_tokens = next_token_logits.argmax(-1)
+            else:
+                next_token_scores = self.logits_processor(input_ids, next_token_logits)
+                next_token_scores = self.logits_warper(input_ids, next_token_scores)
+                probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+        return next_tokens, encoder_hidden_states
+    def sample(
+        self,
+        seed: Union[Optional[torch.Tensor], Optional[SampleOutput]] = None,
+        prompts: Optional[List[str]] = None,
+        num_steps: int = 1,
+        encoder_hidden_states: torch.Tensor = None,
+        return_tensor: bool = False,
+    ):
+        self.mario_lm.eval()
+        context_len = self.context_len - 28
+        with torch.no_grad():
+            if seed is None:
+                seed = self.mario_lm.generate_seed(1, batch_size=len(prompts)).to(
+                    self.device
+                )
+                out_tensor = seed.to(self.device)
+            elif isinstance(seed, SampleOutput):
+                out_tensor = seed.level_tensor.to(self.device).squeeze()
+            else:
+                out_tensor = seed.to(self.device).squeeze()
+            if len(out_tensor.shape) < 2:
+                # if we pass in a single seed vector, then we repeat for each prompt
+                # Otherwise, we treat inputs as separate seed-prompt pairs
+                out_tensor = out_tensor.view(1, -1).repeat(len(prompts), 1)
+            if encoder_hidden_states is None:
+                if prompts is not None:
+                    encoder_hidden_states = torch.stack(
+                        [
+                            self.mario_lm.prompter.output_hidden(prompt)
+                            for prompt in prompts
+                        ]
+                    )
+                else:
+                    encoder_hidden_states = torch.stack(
+                        [
+                            self.mario_lm.prompter(sample_prompt=True)[1]
+                            for _ in range(seed.shape[0])
+                        ]
+                    )
+            encoder_hidden_states = encoder_hidden_states.to(
+                self.device
+            )  # b x 1 x hidden_dim
+            encoder_hidden_states = encoder_hidden_states.view(
+                out_tensor.shape[0], 1, -1
+            )
+            if not self.use_tqdm:
+                bar = np.arange(num_steps)
+            else:
+                bar = tqdm(np.arange(num_steps))
+            with torch.no_grad():
+                for i in bar:
+                    inp = out_tensor * 1
+                    if len(out_tensor.shape) > 0 and out_tensor.shape[-1] > context_len:
+                        diff = inp.shape[-1] % 14  # height of mario level
+                        ctx = context_len + diff
+                        inp = inp[:, -ctx:] * 1
+                    next_tokens, encoder_hidden_states = self.step(
+                        inp,
+                        encoder_hidden_states=encoder_hidden_states,
+                    )
+                    out_tensor = torch.cat(
+                        [out_tensor, next_tokens.unsqueeze(-1)], dim=-1
+                    )
+                    if self.use_tqdm:
+                        bar.set_description(
+                            f"shape: {inp.shape}, {out_tensor.shape} first: {inp[0][0]}, last: {out_tensor[0][-1]}"
+                        )
+            if self.use_tqdm:
+                bar.close()
+        sample_out = SampleOutput.from_level_predictions(
+            out_tensor,
+            out_tensor[:, -num_steps:],
+            self.mario_lm.tokenizer,
+            self.mario_lm.prompter,
+        )
+        self.mario_lm.train()
+        if return_tensor:
+            return sample_out, out_tensor
+        return sample_out
+    def __call__(self, *args, **kwargs):
+        return self.sample(*args, **kwargs)
+class BertSampler:
+    # Masked-token sampler: sample() runs mario_lm.lm once over a context window
+    # and writes the predicted tokens back at the masked positions.
+    # NOTE(review): as written this class cannot run end to end; the individual
+    # problems are flagged in place below and need a decision on the intended
+    # windowing/index semantics before they can be fixed.
+    def __init__(
+        self,
+        mario_lm: BaseMarioLM,
+        temperature: float = 2.0,
+        top_k: int = 16,
+        context_len: int = 448,
+        mask_proportion: float = 0.16,
+    ):
+        self.mario_lm = mario_lm
+        self.temperature = temperature
+        self.top_k = top_k
+        self.logits_processor = LogitsProcessorList()
+        self.logits_warper = LogitsProcessorList(
+            [
+                TopKLogitsWarper(top_k),  # number of characters
+                TemperatureLogitsWarper(temperature),
+            ]
+        )
+        self.context_len = context_len
+        self.mask_proportion = mask_proportion
+        # Masked span size: the proportion of the context, rounded down to a
+        # multiple of 14 and then increased by 14.
+        self.mask_portion = int(self.context_len * self.mask_proportion)
+        self.mask_portion = self.mask_portion - self.mask_portion % 14 + 14
+        # NOTE(review): sample() reads self.use_argmax, which is never assigned here
+        # (GPTSampler takes it as a constructor argument).
+    @property
+    def device(self) -> torch.device:
+        return self.mario_lm.device
+    def get_context(self, input_ids, mask_indices):
+        # Select a window of input_ids around the masked span given by the
+        # first and last entries of mask_indices.
+        start_idx = mask_indices[0]
+        end_idx = mask_indices[-1]
+        if input_ids.shape[-1] <= self.context_len:
+            # NOTE(review): this keeps only the first (length % 14) elements;
+            # possibly the intent was to drop that remainder instead -- confirm.
+            clipped = input_ids.shape[-1] % 14
+            input_ids = input_ids[:clipped]
+        # NOTE(review): "/" makes portion a float, so left/right are not
+        # integers when used as slice bounds below -- confirm whether "//" was intended.
+        portion = (self.context_len - self.mask_portion) / 2
+        remainder = 0
+        left = start_idx - portion
+        if left < 0:
+            # Shift the window right by the amount it would extend past the start.
+            # NOTE(review): left itself is not reset to 0 here, so a negative
+            # value still reaches the slice.
+            remainder = -1 * left
+        right = end_idx + portion + remainder
+        return input_ids[left:right]
+    def sample(
+        self,
+        seed: Union[torch.Tensor, SampleOutput],
+        mask: torch.Tensor,
+        return_tensor: bool = False,
+    ):
+        # Fill the positions marked non-zero in mask with model predictions.
+        # Returns the SampleOutput(s), plus the predicted tokens if return_tensor is True.
+        # The model is put in eval mode first and switched to train mode afterwards.
+        self.mario_lm.eval()
+        mask_indices = mask.nonzero()
+        input_ids = seed
+        if isinstance(seed, SampleOutput):
+            input_ids = seed.level_tensor.to(self.device).squeeze()
+        input_id_list = []
+        for i in range(input_ids.shape[0]):
+            input_id = input_ids[i]
+            # Positions (last index column) of the mask entries belonging to row i
+            mask_index = mask_indices[mask_indices[:, 0] == i][:, -1]
+            input_id = self.get_context(input_id, mask_index)
+            input_id_list.append(input_id)
+        # NOTE(review): input_id_list is filled above but never used; stack is
+        # given input_ids instead -- presumably input_id_list was meant.
+        input_ids = torch.stack(input_ids, dim=0).to(self.device)
+        # NOTE(review): seed may be a SampleOutput, which has no device field.
+        attention_mask = torch.ones_like(input_ids).to(seed.device)
+        if len(input_ids.shape) < 2:
+            # if we pass in a single seed vector, then we repeat for each prompt
+            # Otherwise, we treat inputs as separate seed-prompt pairs
+            input_ids = input_ids.view(1, -1)
+        out = self.mario_lm.lm(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=None,
+        )
+        logits = out.logits.detach()
+        if len(logits.shape) == 2:
+            logits = logits.view(1, 1, -1)
+        if self.use_argmax:
+            tokens = logits.argmax(-1)
+        else:
+            # NOTE(review): tokens is read here before it has been assigned on
+            # this branch -- presumably logits was meant.
+            tokens_scores = self.logits_processor(input_ids, tokens)
+            tokens_scores = self.logits_warper(input_ids, tokens_scores)
+            probs = torch.nn.functional.softmax(tokens_scores, dim=-1)
+            tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+        out = input_ids.detach()
+        for i in range(input_ids.shape[0]):
+            # Overwrite only the masked positions of row i with the predictions.
+            # NOTE(review): mask_index is computed from the original mask, while
+            # input_ids may have been windowed by get_context -- confirm the
+            # positions still line up.
+            mask_index = mask_indices[mask_indices[:, 0] == i][:, -1]
+            out[i, mask_index] = tokens[i, mask_index].detach()
+        sample_out = SampleOutput.from_level_predictions(
+            out,
+            tokens,
+            self.mario_lm.tokenizer,
+            self.mario_lm.prompter,
+        )
+        self.mario_lm.train()
+        if return_tensor:
+            return sample_out, tokens
+        return sample_out

sentence_transformers_helper.py ADDED Viewed

	@@ -0,0 +1,114 @@

+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+#Mean Pooling - Take average of all tokens
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output.last_hidden_state
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+#Encode text
+def encode(texts, tokenizer, model, device='cpu'):
+    # Tokenize sentences
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+    encoded_input.to(device)
+    # Compute token embeddings
+    with torch.no_grad():
+        model_output = model(**encoded_input, return_dict=True)
+    # Perform pooling
+    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    # Normalize embeddings
+    embeddings = F.normalize(embeddings, p=2, dim=1)
+    embeddings = embeddings.to(device)
+    return embeddings
+# Get embeddings for a batch of captions and optional negative captions
+def get_embeddings(batch_size, tokenizer, model, captions=None, neg_captions=None, device='cpu'):
+    embeddings = encode([""]*batch_size, tokenizer, model, device)
+    if captions is not None:
+        caption_embeddings = encode(captions, tokenizer, model, device)
+        embeddings = torch.cat((embeddings, caption_embeddings), dim=0)
+    if neg_captions is not None:
+        neg_embeddings = encode(neg_captions, tokenizer, model, device)
+        embeddings = torch.cat((neg_embeddings, embeddings), dim=0)
+    embeddings = embeddings.unsqueeze(1)
+    return embeddings
+def get_embeddings_split(batch_size, tokenizer, model, captions=None, neg_captions=None, device='cpu', max_length=20):
+    padding_length = max(max([s.count(".") for s in captions]) if captions else 1,
+                     max([s.count(".") for s in neg_captions]) if neg_captions else 1)
+    if (padding_length>max_length):
+        raise ValueError(f"Token sequence length {padding_length} exceeds specified length {max_length}.")
+    empty_split = split_sentences([""] * batch_size, padding_length)
+    embeddings = get_embeddings_from_split(empty_split, tokenizer, model, device)
+    if(captions is not None):
+        captions_split = split_sentences(captions, padding_length)
+        caption_embeddings = get_embeddings_from_split(captions_split, tokenizer, model, device)
+        embeddings = torch.cat((embeddings, caption_embeddings), dim=0)
+    if(neg_captions is not None):
+        neg_split = split_sentences(neg_captions, padding_length)
+        neg_embeddings = get_embeddings_from_split(neg_split, tokenizer, model, device)
+        embeddings = torch.cat((neg_embeddings, embeddings), dim=0)
+    #We don't need to unsqueeze this, we have an array of (batch_size, padding_length, encoding_size) already
+    return embeddings.to(device)
+#This method takes a caption batch in list form, and outputs a 2d list where every caption has been split by period
+def split_sentences(caption_array, padding_length=20):
+    split_caption_array = []
+    #Padding happens here
+    for caption in caption_array:
+        split_caption = [s.strip() for s in caption.split(".") if s.strip()]
+        #This is the token padding, we just use an empty string
+        split_caption += [""] * (padding_length - len(split_caption))
+        split_caption_array.append(split_caption)
+    return split_caption_array
+#Expects all split vectors to be the same length
+def get_embeddings_from_split(caption_batch, tokenizer, model, device='cpu'):
+    all_caption_encodings = []
+    for caption_sequence in caption_batch:
+        #Encode the sequence of split captions as if it was a batch, should now be a [maxlength, embeddingsize] tensor
+        caption_sequence = encode(caption_sequence, tokenizer, model, device)
+        #We don't reshape this to avoid having to unsqueeze it later
+        all_caption_encodings.append(caption_sequence)
+    all_caption_encodings = torch.stack(all_caption_encodings, dim=0)
+    return all_caption_encodings
+if __name__ == "__main__":
+    cap = split_sentences(["Hello. My name is George. How. Are you doing. Today?", "I am doing. Just fine. Thanks."])
+    model_url = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    device = 'cuda'
+    tokenizer = AutoTokenizer.from_pretrained(model_url)
+    model = AutoModel.from_pretrained(model_url, trust_remote_code=True).to(device)
+    get_embeddings_from_split(cap, tokenizer, model, device)

text_diffusion_pipeline.py ADDED Viewed

	@@ -0,0 +1,442 @@

+import torch
+import torch.nn.functional as F
+from typing import NamedTuple, Optional
+import os
+from diffusers import DDPMPipeline, UNet2DConditionModel, DDPMScheduler
+import json
+# Running the main at the end of this requires messing with this import
+from text_model import TransformerModel
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+import common_settings as common_settings
+import sentence_transformers_helper as st_helper
+import text_model as text_model
+from general_training_helper import get_scene_from_embeddings
+class PipelineOutput(NamedTuple):
+    # Return type of the pipeline's __call__ (see its return annotation).
+    # images: tensor holding the generated batch
+    images: torch.Tensor
+# Create a custom pipeline for text-conditional generation
+class TextConditionalDDPMPipeline(DDPMPipeline):
+    def __init__(self, unet, scheduler, text_encoder=None, tokenizer=None, supports_pretrained_split=False, block_embeddings=None):
+        # unet / scheduler: forwarded to DDPMPipeline.__init__
+        # text_encoder: optional caption encoder; None leaves the pipeline without one
+        # tokenizer: optional; taken from text_encoder.tokenizer when omitted
+        # supports_pretrained_split: flag stored as-is (persisted by save_pretrained)
+        # block_embeddings: optional value stored as-is (persisted by save_pretrained)
+        super().__init__(unet=unet, scheduler=scheduler)
+        self.text_encoder = text_encoder
+        self.tokenizer = tokenizer
+        # True only if the unet has a truthy negative_prompt_support attribute
+        self.supports_negative_prompt = hasattr(unet, 'negative_prompt_support') and unet.negative_prompt_support
+        self.supports_pretrained_split = supports_pretrained_split
+        self.block_embeddings = block_embeddings
+        if self.tokenizer is None and self.text_encoder is not None:
+            # Use the tokenizer from the text encoder if not provided
+            self.tokenizer = self.text_encoder.tokenizer
+        # Register the text_encoder so that .to(), .cpu(), .cuda(), etc. work correctly
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            text_encoder=self.text_encoder,
+            tokenizer=self.tokenizer,
+        )
+    # Override the to() method to ensure text_encoder is moved to the correct device
+    def to(self, device=None, dtype=None):
+        # Call the parent's to() method first
+        pipeline = super().to(device, dtype)
+        # Additionally move the text_encoder to the device
+        if self.text_encoder is not None:
+            self.text_encoder.to(device)
+        return pipeline
+    def save_pretrained(self, save_directory):
+        os.makedirs(save_directory, exist_ok=True)
+        super().save_pretrained(save_directory)  # saves UNet and scheduler
+        # Save block_embeddings tensor if it exists
+        if self.block_embeddings is not None:
+            torch.save(self.block_embeddings, os.path.join(save_directory, "block_embeddings.pt"))
+        # Save supports_negative_prompt and supports_pretrained_split flags
+        with open(os.path.join(save_directory, "pipeline_config.json"), "w") as f:
+            json.dump({
+                "supports_negative_prompt": self.supports_negative_prompt,
+                "supports_pretrained_split": self.supports_pretrained_split,
+                "text_encoder_type": type(self.text_encoder).__name__
+            }, f)
+        #Text encoder/tokenizer saving is different depending on if we're using a larger pretrained model
+        if isinstance(self.text_encoder, TransformerModel):
+            # Save custom text encoder
+            if self.text_encoder is not None:
+                self.text_encoder.save_pretrained(os.path.join(save_directory, "text_encoder"))
+        else:
+            #Save pretrained tokenizer by name, so we can load from huggingface instead of saving a giant local model
+            text_encoder_info = {
+                "text_encoder_name": self.text_encoder.config.name_or_path,
+                "tokenizer_name": self.tokenizer.name_or_path,
+            }
+            text_encoder_directory = os.path.join(save_directory, "text_encoder")
+            os.makedirs(text_encoder_directory, exist_ok=True)
+            with open(os.path.join(text_encoder_directory, "loading_info.json"), "w") as f:
+                json.dump(text_encoder_info, f)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path, **kwargs):
+        #from diffusers.utils import load_config, load_state_dict
+        # Load model_index.json
+        #model_index = load_config(pretrained_model_path)
+        # Load components manually
+        unet_path = os.path.join(pretrained_model_path, "unet")
+        unet = UNet2DConditionModel.from_pretrained(unet_path)
+        scheduler_path = os.path.join(pretrained_model_path, "scheduler")
+        # Have heard that DDIMScheduler might be faster for inference, though not necessarily better
+        scheduler = DDPMScheduler.from_pretrained(scheduler_path)
+        tokenizer = None
+        text_encoder_path = os.path.join(pretrained_model_path, "text_encoder")
+        if os.path.exists(text_encoder_path):
+            #Test for the new saving system, where we save a simple config file
+            if os.path.exists(os.path.join(text_encoder_path, "loading_info.json")):
+                with open(os.path.join(text_encoder_path, "loading_info.json"), "r") as f:
+                    encoder_config = json.load(f)
+                text_encoder = AutoModel.from_pretrained(encoder_config['text_encoder_name'], trust_remote_code=True)
+                tokenizer = AutoTokenizer.from_pretrained(encoder_config['tokenizer_name'])
+            #Legacy loading system, loads models directly if the whole thing is saved in the directory
+            else:
+                try:
+                    text_encoder = AutoModel.from_pretrained(text_encoder_path, local_files_only=True, trust_remote_code=True)
+                    tokenizer = AutoTokenizer.from_pretrained(text_encoder_path, local_files_only=True)
+                except (ValueError, KeyError):
+                    text_encoder = TransformerModel.from_pretrained(text_encoder_path)
+                    tokenizer = text_encoder.tokenizer
+        else:
+            text_encoder = None
+        # Instantiate your pipeline
+        pipeline = cls(
+            unet=unet,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            **kwargs,
+        )
+        #Loads block embeddings if present
+        block_embeds_path = os.path.join(pretrained_model_path, "block_embeddings.pt")
+        if os.path.exists(block_embeds_path):
+            pipeline.block_embeddings = torch.load(block_embeds_path, map_location="cpu")
+        else:
+            pipeline.block_embeddings = None
+        # Load supports_negative_prompt flag if present
+        config_path = os.path.join(pretrained_model_path, "pipeline_config.json")
+        if os.path.exists(config_path):
+            with open(config_path, "r") as f:
+                config = json.load(f)
+            pipeline.supports_negative_prompt = config.get("supports_negative_prompt", False)
+            pipeline.supports_pretrained_split = config.get("supports_pretrained_split", False)
+        return pipeline
+    # --- Handle batching for captions ---
+    def _prepare_text_batch(self, text: Optional[str | list[str]], batch_size: int, name: str) -> Optional[list[str]]:
+        if text is None:
+            return None
+        if isinstance(text, str):
+            return [text] * batch_size
+        if isinstance(text, list):
+            if len(text) == 1:
+                return text * batch_size
+            if len(text) != batch_size:
+                raise ValueError(f"{name} list length {len(text)} does not match batch_size {batch_size}")
+            return text
+        raise ValueError(f"{name} must be a string or list of strings")
+    def _prepare_initial_sample(self,
+                                raw_latent_sample: Optional[torch.Tensor],
+                                input_scene: Optional[torch.Tensor],
+                                batch_size: int, height: int, width: int,
+                                generator: Optional[torch.Generator]) -> torch.Tensor:
+        """Prepare the initial sample for diffusion."""
+        sample_shape = (batch_size, self.unet.config.in_channels, height, width)
+        if raw_latent_sample is not None:
+            if input_scene is not None:
+                raise ValueError("Cannot provide both raw_latent_sample and input_scene")
+            sample = raw_latent_sample.to(self.device)
+            if sample.shape[1] != sample_shape[1]:
+                raise ValueError(f"Wrong number of channels in raw_latent_sample: Expected {self.unet.config.in_channels} but got {sample.shape[1]}")
+            if sample.shape[0] == 1 and batch_size > 1:
+                sample = sample.repeat(batch_size, 1, 1, 1)
+            elif sample.shape[0] != batch_size:
+                raise ValueError(f"raw_latent_sample batch size {sample.shape[0]} does not match batch_size {batch_size}")
+        elif input_scene is not None:
+            # input_scene can be (H, W) or (batch_size, H, W)
+            scene_tensor = torch.tensor(input_scene, dtype=torch.long, device=self.device)
+            if scene_tensor.dim() == 2:
+                # (H, W) -> repeat for batch
+                scene_tensor = scene_tensor.unsqueeze(0).repeat(batch_size, 1, 1)
+            elif scene_tensor.shape[0] == 1 and batch_size > 1:
+                scene_tensor = scene_tensor.repeat(batch_size, 1, 1)
+            elif scene_tensor.shape[0] != batch_size:
+                raise ValueError(f"input_scene batch size {scene_tensor.shape[0]} does not match batch_size {batch_size}")
+            # One-hot encode: (batch, H, W, C)
+            one_hot = F.one_hot(scene_tensor, num_classes=self.unet.config.in_channels).float()
+            # (batch, H, W, C) -> (batch, C, H, W)
+            sample = one_hot.permute(0, 3, 1, 2)
+        else:
+            # Start from random noise
+            sample = torch.randn(sample_shape, generator=generator, device=self.device)
+        return sample
+    def __call__(
+        self,
+        caption: Optional[str | list[str]] = None,
+        negative_prompt: Optional[str | list[str]] = None,
+        generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = common_settings.NUM_INFERENCE_STEPS,
+        guidance_scale: float = common_settings.GUIDANCE_SCALE,
+        height: int = common_settings.MARIO_HEIGHT,
+        width: int = common_settings.MARIO_WIDTH,
+        raw_latent_sample: Optional[torch.FloatTensor] = None,
+        input_scene: Optional[torch.Tensor] = None,
+        output_type: str = "tensor",
+        batch_size: int = 1,
+        show_progress_bar: bool = True,
+    ) -> PipelineOutput:
+        """Generate a batch of images based on text input using the diffusion model.
+        Args:
+            caption: Text description(s) of the desired output. Can be a string or list of strings.
+            negative_prompt: Text description(s) of what should not appear in the output. String or list.
+            generator: Random number generator for reproducibility.
+            num_inference_steps: Number of denoising steps (more = higher quality, slower).
+            guidance_scale: How strongly the generation follows the text prompt (higher = stronger).
+            height: Height of generated image in tiles.
+            width: Width of generated image in tiles.
+            raw_latent_sample: Optional starting point for diffusion instead of random noise.
+                Must have correct number of channels matching the UNet.
+            input_scene: Optional 2D or 3D int tensor where each value corresponds to a tile type.
+                Will be converted to one-hot encoding as starting point.
+            output_type: Currently only "tensor" is supported.
+            batch_size: Number of samples to generate in parallel.
+        Returns:
+            PipelineOutput containing the generated image tensor (batch_size, ...).
+        """
+        #       I would like to simplify the code to this, but the AI suggestion didn't work, and
+        #       I did not feel good just pasting it all in. Will need to tackle it bit by bit.
+        #        if caption is not None and self.text_encoder is None:
+        #            raise ValueError("Text encoder required for conditional generation")
+        #        self.unet.eval()
+        #        if self.text_encoder is not None:
+        #            self.text_encoder.to(self.device)
+        #            self.text_encoder.eval()
+        #
+        #        with torch.no_grad():
+        #            # Process text inputs
+        #            captions = self.prepare_text_batch(caption, batch_size, "caption")
+        #            negatives = self.prepare_text_batch(negative_prompt, batch_size, "negative_prompt")
+        #            # Get embeddings
+        #            text_embeddings = self.prepare_embeddings(captions, negatives, batch_size)
+        #
+        #            # Set up initial latent state
+        #            sample = self.prepare_initial_sample(raw_latent_sample, input_scene,
+        #                                              batch_size, height, width, generator)
+        #            # Run diffusion process
+        #            sample = self.run_diffusion(sample, text_embeddings, num_inference_steps,
+        #                                      guidance_scale, generator, show_progress_bar,
+        #                                      has_caption=caption is not None,
+        #                                      has_negative=negative_prompt is not None)
+        #            # Format output
+        #            if output_type == "tensor":
+        #                sample = F.softmax(sample, dim=1)
+        #            else:
+        #                raise ValueError(f"Unsupported output type: {output_type}")
+        #        return PipelineOutput(images=sample)
+        # Validate text encoder if we need it
+        if caption is not None and self.text_encoder is None:
+            raise ValueError("Text encoder is required for conditional generation")
+        self.unet.eval()
+        if self.text_encoder is not None:
+            self.text_encoder.to(self.device)
+            self.text_encoder.eval()
+        with torch.no_grad():
+            captions = self._prepare_text_batch(caption, batch_size, "caption")
+            negatives = self._prepare_text_batch(negative_prompt, batch_size, "negative_prompt")
+            # --- Prepare text embeddings ---
+            if(isinstance(self.text_encoder, TransformerModel)):
+                text_embeddings = text_model.get_embeddings(batch_size=batch_size,
+                                                            tokenizer=self.text_encoder.tokenizer,
+                                                            text_encoder=self.text_encoder,
+                                                            captions=captions,
+                                                            neg_captions=negatives,
+                                                            device=self.device)
+            else: #Case for the pre-trained text encoder
+                if(self.supports_pretrained_split): #If we have a split flag incorporated
+                    text_embeddings = st_helper.get_embeddings_split(batch_size = batch_size,
+                                                            tokenizer=self.tokenizer,
+                                                            model=self.text_encoder,
+                                                            captions=captions,
+                                                            neg_captions=negatives,
+                                                            device=self.device)
+                else:
+                    text_embeddings = st_helper.get_embeddings(batch_size = batch_size,
+                                                                tokenizer=self.tokenizer,
+                                                                model=self.text_encoder,
+                                                                captions=captions,
+                                                                neg_captions=negatives,
+                                                                device=self.device)
+            # --- Set up initial latent state ---
+            sample = self._prepare_initial_sample(raw_latent_sample, input_scene,
+                                                 batch_size, height, width, generator)
+            # --- Set up diffusion process ---
+            self.scheduler.set_timesteps(num_inference_steps)
+            # Denoising loop
+            iterator = self.progress_bar(self.scheduler.timesteps) if show_progress_bar else self.scheduler.timesteps
+            for t in iterator:
+                # Handle conditional generation
+                if captions is not None:
+                    if negatives is not None:
+                        # Three copies for negative prompt guidance
+                        model_input = torch.cat([sample, sample, sample], dim=0)
+                    else:
+                        # Two copies for standard classifier-free guidance
+                        model_input = torch.cat([sample, sample], dim=0)
+                else:
+                    model_input = sample
+                # Predict noise residual
+                model_kwargs = {"encoder_hidden_states": text_embeddings}
+                noise_pred = self.unet(model_input, t, **model_kwargs).sample
+                # Apply guidance
+                if captions is not None:
+                    if negatives is not None:
+                        # Split predictions for negative, unconditional, and text-conditional
+                        noise_pred_neg, noise_pred_uncond, noise_pred_text = noise_pred.chunk(3)
+                        noise_pred_guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                        noise_pred = noise_pred_guided - guidance_scale * (noise_pred_neg - noise_pred_uncond)
+                    else:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # Compute previous sample: x_{t-1} = scheduler(x_t, noise_pred)
+                sample = self.scheduler.step(noise_pred, t, sample, generator=generator).prev_sample
+            # Convert to output format
+            if output_type == "tensor":
+                if self.block_embeddings is not None:
+                    sample = get_scene_from_embeddings(sample, self.block_embeddings)
+                else:
+                    # Apply softmax to get probabilities for each tile type
+                    sample = F.softmax(sample, dim=1)
+                    sample = sample.detach().cpu()
+            else:
+                raise ValueError(f"Unsupported output type: {output_type}")
+        return PipelineOutput(images=sample)
+    def print_unet_architecture(self):
+        """Prints the architecture of the UNet model."""
+        print(self.unet)
+    def print_text_encoder_architecture(self):
+        """Prints the architecture of the text encoder model, if it exists."""
+        if self.text_encoder is not None:
+            print(self.text_encoder)
+        else:
+            print("No text encoder is set.")
+    def save_unet_architecture_pdf(self, height, width, filename="unet_architecture", batch_size=1, device=None):
+        """
+        Have to separately install torchview for this to work
+        Saves a visualization of the UNet architecture as a PDF using torchview.
+        Args:
+            height: Height of the dummy input.
+            width: Width of the dummy input.
+            filename: Output PDF filename.
+            batch_size: Batch size for dummy input.
+            device: Device to run the dummy input on (defaults to pipeline device).
+        """
+        from torchview import draw_graph
+        import graphviz
+        if device is None:
+            device = self.device if hasattr(self, 'device') else 'cpu'
+        in_channels = self.unet.config.in_channels if hasattr(self.unet, 'config') else 1
+        sample_shape = tuple([batch_size, in_channels, height, width])
+        dummy_x = torch.randn(size=sample_shape, device=device)
+        dummy_t = torch.tensor([0] * batch_size, dtype=torch.long, device=device)
+        # Prepare dummy text embedding (match what your UNet expects)
+        if hasattr(self.unet, 'config') and hasattr(self.unet.config, 'cross_attention_dim'):
+            cross_attention_dim = self.unet.config.cross_attention_dim
+        else:
+            cross_attention_dim = 128  # fallback
+        encoder_hidden_states = torch.randn(batch_size, 1, cross_attention_dim, device=device)
+        self.unet.eval()
+        inputs = (dummy_x, dummy_t, encoder_hidden_states)
+        #self.unet.down_blocks = self.unet.down_blocks[:2]
+        graph = draw_graph(
+            model=self.unet,
+            input_data=inputs,
+            expand_nested=False,
+            #enable_output_shape=True,
+            #roll_out="nested",
+            depth=1
+        )
+        #graph.visual_graph.engine = "neato"
+        graph.visual_graph.attr(#rankdir="LR",
+                                nodesep="0.1",      # decrease space between nodes in the same rank (default ~0.25)
+                                ranksep="0.2",       # decrease space between ranks (default ~0.5)
+                                concentrate="true"  # merge edges between nodes in the same rank
+                            )
+        graph.visual_graph.node_attr.update(
+            shape="rectangle",
+            width="1.5",   # narrow width
+            height="0.5"  # taller height to make vertical rectangles
+            #fixedsize="true"
+        )
+        graph.visual_graph.render(filename, format='pdf', cleanup=False)  # Cleanup removes intermediate files
+        graph.visual_graph.save('unet_architecture.dot')
+        # Save the graph to a PDF file
+        print(f"UNet architecture saved to {filename}")

text_model.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import argparse
+from xml.parsers.expat import model
+import torch
+import torch.nn as nn
+import math
+import os
+import json
+from safetensors.torch import save_file, load_file
+from tokenizer import Tokenizer
+def get_embeddings(batch_size, tokenizer, text_encoder, captions=None, neg_captions=None, device='cpu'):
+    max_length = text_encoder.max_seq_length
+    empty_ids = encode_token_captions([""] * batch_size, tokenizer, max_length, device=device)
+    embeddings = text_encoder.get_embeddings(empty_ids)
+    if(captions is not None):
+        caption_ids = encode_token_captions(captions, tokenizer, max_length, device=device)
+        caption_embeddings = text_encoder.get_embeddings(caption_ids)
+        embeddings = torch.cat((embeddings, caption_embeddings), dim=0)
+    if(neg_captions is not None):
+        neg_ids = encode_token_captions(neg_captions, tokenizer, max_length, device=device)
+        neg_embeddings = text_encoder.get_embeddings(neg_ids)
+        embeddings = torch.cat((neg_embeddings, embeddings), dim=0)
+    return embeddings.to(device)
+def encode_token_captions(captions, tokenizer, max_length, device='cpu'):
+    caption_ids = []
+    for caption in captions:
+        tokens = tokenizer.encode(caption)
+        caption_tokens = tokenizer.pad_sequence(tokens, max_length)
+        caption_ids.append(torch.tensor(caption_tokens, dtype=torch.long).unsqueeze(0))
+    return torch.cat(caption_ids, dim=0).to(device)
+# Transformer model for MLM training
+class TransformerModel(nn.Module):
+    def __init__(self, vocab_size, embedding_dim, hidden_dim, tokenizer=None, num_heads=8, num_layers=4, max_seq_length=100):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.vocab_size = vocab_size
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.max_seq_length = max_seq_length
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.positional_encoding = self.create_positional_encoding(max_seq_length, embedding_dim)
+        encoder_layers = nn.TransformerEncoderLayer(
+            d_model=embedding_dim,
+            nhead=num_heads,
+            dim_feedforward=hidden_dim,
+            batch_first=True
+        )
+        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
+        self.fc = nn.Linear(embedding_dim, vocab_size)
+        self.tokenizer = tokenizer
+    def create_positional_encoding(self, max_seq_length, embedding_dim):
+        # The implementation uses a sinusoidal positional encoding, which creates a unique pattern for each position in the sequence.
+        # The frequencies create unique values, the sin/cos bounds values
+        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+        # Creates a set of divisors that create different frequencies
+        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
+        pe = torch.zeros(max_seq_length, embedding_dim)
+        # Even dimensions use sin, odd dimensions use cos
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        return pe.unsqueeze(0)
+    def get_embeddings(self, x):
+        """ This gets the actual latent embedding vectors """
+        # Ensure positional encoding is on the same device as input
+        pe = self.positional_encoding[:, :x.size(1), :].to(x.device)
+        # Embed input and add positional encoding
+        embedded = self.embedding(x) + pe
+        return self.transformer(embedded)
+    def forward(self, x):
+        """ This gets the token within the vocabulary """
+        transformer_out = self.get_embeddings(x)
+        # Project to vocabulary size
+        return self.fc(transformer_out)
+    def save_pretrained(self, save_directory):
+        os.makedirs(save_directory, exist_ok=True)
+        config = {
+            "vocab_size": self.vocab_size,
+            "embedding_dim": self.embedding_dim,
+            "hidden_dim": self.hidden_dim,
+            "num_heads": self.num_heads,
+            "num_layers": self.num_layers,
+            "max_seq_length": self.max_seq_length,
+        }
+        with open(os.path.join(save_directory, "config.json"), "w") as f:
+            json.dump(config, f)
+        # Save model weights
+        save_file(self.state_dict(), os.path.join(save_directory, "model.safetensors"))
+        # Save tokenizer if present
+        if self.tokenizer is not None:
+            self.tokenizer.save(os.path.join(save_directory, "tokenizer.pkl"))
+    @classmethod
+    def from_pretrained(cls, load_directory):
+        with open(os.path.join(load_directory, "config.json")) as f:
+            config = json.load(f)
+        model = cls(**config)
+        # Load weights
+        state_dict = load_file(os.path.join(load_directory, "model.safetensors"))
+        model.load_state_dict(state_dict)
+        # Load tokenizer if available
+        tokenizer_path = os.path.join(load_directory, "tokenizer.pkl")
+        if os.path.exists(tokenizer_path):
+            tokenizer = Tokenizer()
+            tokenizer.load(tokenizer_path)
+            model.tokenizer = tokenizer
+        return model
+    def print_architecture(self, inputs=None):
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--model_path", type=str, required=True, help="Path to trained transformer model")
+        parser.add_argument("--json", type=str, default="SMB1_LevelsAndCaptions-regular-test.json", help="Path to dataset json file")
+        parser.add_argument("--num_samples", type=int, default=10, help="Number of captions to evaluate")
+        parser.add_argument("--mask_prob", type=float, default=0.15, help="Probability of masking each token")
+        parser.add_argument("--compare_checkpoints", action="store_true", default=False, help="Run comparison across all model checkpoints")
+        args = parser.parse_args()
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = TransformerModel.from_pretrained(args.model_path).to(device)
+        print(f"Loaded model from {args.model_path}")
+        import os
+        import re
+        import json
+        import matplotlib.pyplot as plt
+        from torchview import draw_graph
+        import graphviz
+        graph = draw_graph(
+            model=model,
+            input_data=inputs,
+            expand_nested=False,
+            #enable_output_shape=True,
+            #roll_out="nested",
+            depth=1
+        )
+        # Save plot
+        filename = 'mlm_architecture'
+        graph.visual_graph.render(filename, format='pdf', cleanup=False)  # Cleanup removes intermediate files
+        #graph.visual_graph.save('unet_architecture.dot')
+    def save_architecture_pdf(self, filename="transformer_architecture.pdf", input_length=32):
+        """Save a visualization of the model architecture as a PDF using torchview."""
+        try:
+            from torchview import draw_graph
+        except ImportError:
+            raise ImportError("torchview is required for model visualization. Install with 'pip install torchview'.")
+        import torch
+        import os
+        # Create a dummy input of the correct type for the model
+        captions = ["full floor. two coins. one pipe.", "floor with two gaps. one cannon. many enemies."]
+        tensor = encode_token_captions(captions, self.tokenizer, self.max_seq_length, device=next(self.parameters()).device)
+        input_length = tensor.size(1) if tensor.dim() > 1 else self.max_seq_length
+        num_tokens_list = [len(self.tokenizer.encode(c)) for c in captions]
+        input_length = max(num_tokens_list) if num_tokens_list else input_length
+        dummy_input = torch.zeros((1, input_length), dtype=torch.long, device=next(self.parameters()).device)
+        # Draw the graph and save as PNG
+        graph = draw_graph(self, input_data=dummy_input, expand_nested=True, save_graph=True, filename=filename.replace('.pdf',''), directory=".", depth=2)
+        png_file = filename.replace('.pdf', '.png')
+        # Convert PNG to PDF
+        if os.path.exists(png_file):
+            try:
+                from PIL import Image
+                im = Image.open(png_file)
+                im.save(filename, "PDF", resolution=100.0)
+                print(f"Saved architecture PDF to {filename}")
+                # Optionally, remove the PNG file
+                os.remove(png_file)
+            except ImportError:
+                print(f"PIL not installed. Architecture saved as PNG: {png_file}")
+            except Exception as e:
+                print(f"Could not convert PNG to PDF: {e}")
+        else:
+            print(f"Could not find PNG file to convert: {png_file}")

text_to_level_diffusion.py ADDED Viewed

	@@ -0,0 +1,194 @@

+from interactive_generation import InteractiveGeneration
+import torch
+from level_dataset import visualize_samples, convert_to_level_format, positive_negative_caption_split
+from caption_match import compare_captions, process_scene_segments
+from create_ascii_captions import assign_caption
+from util import extract_tileset
+from sampler import scene_to_ascii
+import argparse
+import common_settings as common_settings
+from sampler import SampleOutput
+from pipeline_loader import get_pipeline
+def parse_args():
+    parser = argparse.ArgumentParser(description="Generate levels using a trained diffusion model")
+    # Model and generation parameters
+    parser.add_argument("--model_path", type=str, required=True, help="Path to the trained diffusion model")
+    parser.add_argument("--tileset", default='..\TheVGLC\Super Mario Bros\smb.json', help="Descriptions of individual tile types")
+    #parser.add_argument("--describe_locations", action="store_true", default=False, help="Include location descriptions in the captions")
+    parser.add_argument("--describe_absence", action="store_true", default=False, help="Indicate when there are no occurrences of an item or structure")
+    parser.add_argument("--automatic_negative_captions", action="store_true", default=False, help="Automatically create negative captions for prompts so the user doesn't have to")
+    parser.add_argument(
+        "--game",
+        type=str,
+        default="Mario",
+        choices=["Mario", "LR"],
+        help="Which game to create a model for (affects sample style and tile count)"
+    )
+    return parser.parse_args()
+class InteractiveLevelGeneration(InteractiveGeneration):
+    def __init__(self, args):
+        super().__init__(
+            {
+                "caption": str,
+                "width": int,
+                "negative_prompt": str,
+                "start_seed": int,
+                "end_seed": int,
+                "num_inference_steps": int,
+                "guidance_scale": float
+            },
+            default_parameters={
+                "width":  width, #common_settings.MARIO_WIDTH,
+                "start_seed": 1,
+                "end_seed": 1,  # Will be set to start_seed if blank
+                "num_inference_steps": common_settings.NUM_INFERENCE_STEPS,
+                "guidance_scale": common_settings.GUIDANCE_SCALE,
+                "caption": "",
+                "negative_prompt": ""
+            }
+        )
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.pipe = get_pipeline(args.model_path).to(self.device)
+        self.pipe.print_unet_architecture()
+        #self.pipe.save_unet_architecture_pdf(height, width)
+        if args.automatic_negative_captions or not self.pipe.supports_negative_prompt:
+            self.input_parameters.pop('negative_prompt', None)
+            self.default_parameters.pop('negative_prompt', None)
+        if args.automatic_negative_captions and not self.pipe.supports_negative_prompt:
+            raise ValueError("Automatic negative caption generation is not possible with a model that doesn't support it")
+        if args.tileset:
+            _, self.id_to_char, self.char_to_id, self.tile_descriptors = extract_tileset(args.tileset)
+        self.args = args
+        if self.args.game == "LR":
+            del self.input_parameters["width"]
+        print(f"Tileset in use: {self.args.tileset}")
+    def generate_image(self, param_values, generator, **extra_params):
+        if self.args.automatic_negative_captions:
+            pos, neg = positive_negative_caption_split(param_values["caption"], True)
+            param_values["negative_prompt"] = neg
+        images = self.pipe(
+            generator=generator,
+            **param_values
+        ).images
+        # Convert to indices
+        sample_tensor = images[0].unsqueeze(0)
+        sample_indices = convert_to_level_format(sample_tensor)
+        # Add level data to the list
+        scene = sample_indices[0].tolist()
+        if self.args.game == "LR":
+            number_of_tiles = common_settings.LR_TILE_COUNT
+            scene = [[x % number_of_tiles for x in row] for row in scene]
+        # Assign a caption to the sceneof whichever game is being played
+        if self.args.game == "Mario":
+            actual_caption = assign_caption(scene, self.id_to_char, self.char_to_id, self.tile_descriptors, False, self.args.describe_absence)
+            level_width = common_settings.MARIO_WIDTH
+        elif self.args.game == "LR":
+            actual_caption = lr_assign_caption(scene, self.id_to_char, self.char_to_id, self.tile_descriptors, False, self.args.describe_absence)
+            level_width = common_settings.LR_WIDTH
+        else:
+            raise ValueError(f"Unknown game: {self.args.game}")
+        if args.game == "LR":
+            print(f"Describe resulting image: {actual_caption}")
+            lr_compare_score = lr_compare_captions(param_values.get("caption", ""), actual_caption)
+            print(f"Comparison score: {lr_compare_score}")
+            # Use the new function to process scene segments
+            average_score, segment_captions, segment_scores = lr_process_scene_segments(
+                scene=scene,
+                segment_width=level_width,
+                prompt=param_values.get("caption", ""),
+                id_to_char=self.id_to_char,
+                char_to_id=self.char_to_id,
+                tile_descriptors=self.tile_descriptors,
+                describe_locations=False, #self.args.describe_locations,
+                describe_absence=self.args.describe_absence,
+                verbose=True
+            )
+        elif args.game == "Mario":
+            compare_score = compare_captions(param_values.get("caption", ""), actual_caption)
+            print(f"Comparison score: {compare_score}")
+            # Use the new function to process scene segments
+            average_score, segment_captions, segment_scores = process_scene_segments(
+                scene=scene,
+                segment_width=level_width,
+                prompt=param_values.get("caption", ""),
+                id_to_char=self.id_to_char,
+                char_to_id=self.char_to_id,
+                tile_descriptors=self.tile_descriptors,
+                describe_locations=False, #self.args.describe_locations,
+                describe_absence=self.args.describe_absence,
+                verbose=True
+            )
+            # Ask if user wants to play level
+            play_level = input("Do you want to play this level? (y/n): ").strip().lower()
+            if play_level == 'y':
+                print("Playing level...")
+                char_grid = scene_to_ascii(scene, self.id_to_char, False)
+                level = SampleOutput(level=char_grid, use_snes_graphics=False)
+                console_output = level.run_astar()
+                print(console_output)
+            elif play_level == 'n':
+                print("Level not played.")
+            else:
+                raise ValueError(f"Unknown input: {play_level}")
+        return visualize_samples(images)
+    def get_extra_params(self, param_values):
+        if "negative_prompt" in param_values and param_values["negative_prompt"] == "":
+            del param_values["negative_prompt"]
+        if param_values["caption"] == "":
+            del param_values["caption"]
+        param_values["output_type"] = "tensor"
+        # Lode Runner
+        if self.args.game == "LR":
+            param_values["height"] = common_settings.LR_HEIGHT
+            param_values["width"] = common_settings.LR_WIDTH
+        return dict()
+if __name__ == "__main__":
+    args = parse_args()
+    if args.game == "Mario":
+        args.num_tiles = common_settings.MARIO_TILE_COUNT
+        height = common_settings.MARIO_HEIGHT
+        width = common_settings.MARIO_WIDTH
+        args.tile_size = common_settings.MARIO_TILE_PIXEL_DIM
+        args.tileset = '..\TheVGLC\Super Mario Bros\smb.json'
+    elif args.game == "LR":
+        args.num_tiles = common_settings.LR_TILE_COUNT
+        height = common_settings.LR_HEIGHT
+        width = common_settings.LR_WIDTH
+        args.tile_size = common_settings.LR_TILE_PIXEL_DIM
+        args.tileset = '..\TheVGLC\Lode Runner\Loderunner.json'
+    else:
+        raise ValueError(f"Unknown game: {args.game}")
+    ig = InteractiveLevelGeneration(args)
+    ig.start()

tokenizer.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import json
+import re
+from collections import Counter
+import pickle
+import argparse
+class Tokenizer:
+    def __init__(self):
+        self.special_tokens = ["[PAD]", "[MASK]"]
+        self.vocab = {}
+        self.token_to_id = {}
+        self.id_to_token = {}
+    def tokenize(self, text):
+        # Match words, numbers, periods, and commas as separate tokens
+        tokens = re.findall(r'\w+|[.,]|\[mask\]|\[pad\]', text.lower())
+        # Restore MASK and PAD to all caps
+        modified_list = []
+        for s in tokens:
+            modified_s = s.replace("[mask]", "[MASK]").replace("[pad]", "[PAD]")
+            modified_list.append(modified_s)
+        return modified_list
+    def pad_sequence(self, tokens, length):
+        """Pads tokenized sequences to length with a padding token (assumed to be '[PAD]')."""
+        if len(tokens) > length:
+            raise ValueError(f"Token sequence length {len(tokens)} exceeds specified length {length}.")
+        pad_token = self.token_to_id["[PAD]"]
+        return tokens + [pad_token] * (length - len(tokens))
+    def build_vocab(self, dataset_path, min_freq=1):
+        token_counter = Counter()
+        with open(dataset_path, 'r') as f:
+            data = json.load(f)
+            for entry in data:
+                caption = entry['caption']
+                tokens = self.tokenize(caption)
+                token_counter.update(tokens)
+        # Keep tokens that meet the min frequency
+        tokens = [tok for tok, count in token_counter.items() if count >= min_freq]
+        # Ensure special tokens are always included
+        all_tokens = self.special_tokens + sorted(tokens)
+        # Build vocab dictionaries
+        self.vocab = {tok: idx for idx, tok in enumerate(all_tokens)}
+        self.token_to_id = self.vocab
+        self.id_to_token = {idx: tok for tok, idx in self.vocab.items()}
+        print(f"Vocabulary size: {len(self.vocab)}")
+    def encode(self, text):
+        tokens = self.tokenize(text)
+        encoded = []
+        for tok in tokens:
+            if tok not in self.token_to_id:
+                raise ValueError(f"Unknown token encountered: {tok} in {text}")
+            encoded.append(self.token_to_id[tok])
+        return encoded
+    def encode_batch(self, texts, pad_to_length=None):
+        """
+        Encode a batch of texts into token IDs with padding to ensure uniform length.
+        Args:
+            texts (list): A list of strings to encode
+            pad_to_length (int, optional): Length to pad all sequences to. If None,
+                                          will pad to the length of the longest sequence.
+        Returns:
+            list: A list of lists, where each inner list contains the token IDs for a text
+        """
+        # Get the padding token ID
+        pad_token = self.token_to_id["[PAD]"]
+        # First encode all texts
+        encoded_texts = []
+        for text in texts:
+            try:
+                encoded = self.encode(text)
+                encoded_texts.append(encoded)
+            except ValueError as e:
+                raise ValueError(f"Error encoding text: {text}. {str(e)}")
+        # Determine padding length
+        if pad_to_length is None:
+            pad_to_length = max(len(seq) for seq in encoded_texts)
+        # Pad sequences to uniform length
+        padded_texts = []
+        for seq in encoded_texts:
+            if len(seq) > pad_to_length:
+                # Truncate if too long
+                padded_texts.append(seq[:pad_to_length])
+            else:
+                # Pad if too short
+                padding = [pad_token] * (pad_to_length - len(seq))
+                padded_texts.append(seq + padding)
+        return padded_texts
+    def decode(self, token_ids):
+        return ' '.join(self.id_to_token[tok_id] for tok_id in token_ids)
+    def save(self, path):
+        with open(path, 'wb') as f:
+            pickle.dump({'vocab': self.vocab}, f)
+    def load(self, path):
+        with open(path, 'rb') as f:
+            data = pickle.load(f)
+            self.vocab = data['vocab']
+            self.token_to_id = self.vocab
+            self.id_to_token = {idx: tok for tok, idx in self.vocab.items()}
+    def get_vocab(self):
+        return sorted(self.vocab.keys())
+    def get_vocab_size(self):
+        return len(self.vocab)
if __name__ == "__main__":
    # Command-line entry point: either build a vocabulary from a dataset and
    # pickle it ("save"), or read a previously pickled vocabulary ("load").
    tokenizer = Tokenizer()

    cli = argparse.ArgumentParser(description="Tokenizer utility for saving and loading vocabularies.")
    cli.add_argument("action", choices=["save", "load"], help="Action to perform: 'save' or 'load'.")
    cli.add_argument("--json_file", type=str, default='Mario_LevelsAndCaptions.json', help="Path to the JSON file containing the dataset (required for 'save').")
    cli.add_argument("--pkl_file", type=str, default='Mario_Tokenizer.pkl', help="Path to the pickle file to save/load the tokenizer.")
    options = cli.parse_args()

    if options.action == "load":
        tokenizer.load(options.pkl_file)
    elif options.action == "save":
        # Only reachable with an explicitly empty --json_file, since a default is set.
        if not options.json_file:
            raise ValueError("The --json_file argument is required for the 'save' action.")
        tokenizer.build_vocab(options.json_file)
        tokenizer.save(options.pkl_file)

    # Example usage
    #print(tokenizer.encode("floor with one gap. one enemy."))
    #print(tokenizer.get_vocab())
    #for id, token in tokenizer.id_to_token.items():
    #    print(id,":",token)

util.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import json
+import sys
+import os
+from collections import Counter
# This file contains utility functions for analyzing and describing levels in both Lode Runner and Super Mario Bros.

# Caption-style switches. They could be exposed on the command line, but for now they are hardcoded.
# coarse_locations: not read by the functions shown nearby -- presumably toggles coarse position wording; confirm at its use site.
coarse_locations = True
# coarse_counts: when True, captions use describe_quantity() words ("a few", "many") instead of raw numbers.
coarse_counts = True
# pluralize: when True, caption nouns get a plural form whenever the count is not exactly one.
pluralize = True
# give_staircase_lengths: not read by the functions shown nearby -- presumably adds staircase lengths to captions; confirm at its use site.
give_staircase_lengths = False
def describe_size(count):
    """Return "small" for counts up to and including 4, otherwise "big"."""
    return "small" if count <= 4 else "big"
def describe_quantity(count):
    """
    Translate a number into the coarse quantity word used in captions.

    0 -> "no", 1 -> "one", 2 -> "two", anything else below 5 -> "a few",
    below 10 -> "several", and everything from 10 up -> "many".
    """
    # Exact matches are checked first, in the same order as the thresholds below.
    for exact, word in ((0, "no"), (1, "one"), (2, "two")):
        if count == exact:
            return word
    for upper_bound, word in ((5, "a few"), (10, "several")):
        if count < upper_bound:
            return word
    return "many"
def get_tile_descriptors(tileset):
    """
    Build a mapping from tile character to the set of its descriptors.

    The descriptors come from ``tileset["tiles"]``. Two placeholder tiles,
    "!" and "*", are always (re)defined as passable.
    """
    descriptors = {}
    for symbol, attributes in tileset["tiles"].items():
        descriptors[symbol] = set(attributes)
    # Fake tiles. Should these contain anything? Note that code elsewhere expects everything to be passable or solid
    for placeholder in ("!", "*"):
        descriptors[placeholder] = {"passable"}
    return descriptors
def _count_runs(flags):
    """Return the number of maximal runs of consecutive truthy values in *flags*."""
    runs = 0
    inside = False
    for flag in flags:
        if flag and not inside:
            runs += 1
        inside = bool(flag)
    return runs


def analyze_floor(scene, id_to_char, tile_descriptors, describe_absence):
    """
    Analyze the last row of the scene and generate a floor description.

    Args:
        scene: 2D grid (list of rows) of tile IDs. The width is taken from the first row.
        id_to_char: mapping from tile ID to tile character.
        tile_descriptors: mapping from tile character to its descriptors.
        describe_absence: if True, a completely open bottom row yields "no floor"
            instead of an empty string.

    Returns:
        "full floor", "no floor"/"" (see describe_absence), "floor with N gap(s)" when
        floor tiles outnumber passable tiles, or "giant gap with N chunk(s) of floor"
        otherwise.

    Raises:
        KeyError: if a tile ID in the last row is missing from id_to_char.
        ValueError: if a tile is none of passable, enemy, solid or diggable.
    """
    WIDTH = len(scene[0])
    last_row = scene[-1]  # The FLOOR row of the scene
    row_descriptors = [tile_descriptors.get(id_to_char[tile], []) for tile in last_row]

    # "diggable" tiles count as floor, just like "solid" ones
    solid_count = sum(1 for d in row_descriptors if "solid" in d or "diggable" in d)
    passable_count = sum(1 for d in row_descriptors if "passable" in d)

    if solid_count == WIDTH:
        return "full floor"
    if passable_count == WIDTH:
        return "no floor" if describe_absence else ""

    mostly_floor = solid_count > passable_count

    # Classify every tile of the row as open (part of a gap) or closed (part of the floor).
    open_flags = []
    for tile, descriptors in zip(last_row, row_descriptors):
        # Enemies are also a gap since they immediately fall into the gap
        opened = "passable" in descriptors or "enemy" in descriptors
        closed = "solid" in descriptors
        if opened and closed:
            # Ambiguous tile: gap counting lets "open" win, chunk counting lets "solid" win.
            opened = mostly_floor
        elif not opened and not closed and "diggable" not in descriptors:
            # Diggable-only tiles were already counted as floor above, so they are
            # accepted here as closed; anything else is a tileset error.
            raise ValueError(
                "Every tile should be passable, solid, diggable, or enemy: "
                f"tile {tile!r} ({id_to_char[tile]!r}) has descriptors {sorted(descriptors)!r}"
            )
        open_flags.append(opened)

    if mostly_floor:
        # Count contiguous groups of open tiles
        gaps = _count_runs(open_flags)
        return f"floor with {describe_quantity(gaps) if coarse_counts else gaps} gap" + ("s" if pluralize and gaps != 1 else "")

    # Count contiguous groups of floor tiles
    chunks = _count_runs(not is_open for is_open in open_flags)
    return f"giant gap with {describe_quantity(chunks) if coarse_counts else chunks} chunk"+("s" if pluralize and chunks != 1 else "")+" of floor"
def count_in_scene(scene, tiles, exclude=set()):
    """
    Count the cells of the scene whose tile is in *tiles*.

    Cells whose (row, column) position is in *exclude* are ignored.
    The exclude collection is only read here, never modified.
    """
    return sum(
        1
        for r, row in enumerate(scene)
        for c, cell in enumerate(row)
        if (r, c) not in exclude and cell in tiles
    )
def count_caption_phrase(scene, tiles, name, names, offset = 0, describe_absence=False, exclude=set()):
    """
    Build the caption phrase for how many of the given tiles the scene contains.

    The count from count_in_scene is shifted by *offset* before it is described.
    A positive total yields " <quantity> <name or names>."; otherwise the result
    is " no <names>." when describe_absence is set, or an empty string.
    """
    total = offset + count_in_scene(scene, tiles, exclude)
    if not total > 0:
        return f" no {names}." if describe_absence else ""
    amount = describe_quantity(total) if coarse_counts else total
    noun = names if pluralize and total > 1 else name
    return f" {amount} " + noun + "."
def in_column(scene, x, tile):
    """Return True if any row of the scene holds *tile* at column index *x*."""
    return any(row[x] == tile for row in scene)
def analyze_ceiling(scene, id_to_char, tile_descriptors, describe_absence, ceiling_row = 1):
    """
    Inspect one row of the scene (0-based index ceiling_row) for a ceiling.

    Returns " full ceiling." when every tile in the row is solid, a
    " ceiling with N gap(s)." phrase when more than half of the row is solid,
    and otherwise " no ceiling." (if describe_absence) or an empty string.
    """
    width = len(scene[0])
    cells = scene[ceiling_row]
    cell_descriptors = [tile_descriptors.get(id_to_char[cell], []) for cell in cells]
    solid_total = sum(1 for tags in cell_descriptors if "solid" in tags)

    if solid_total == width:
        return " full ceiling."

    if not solid_total > width // 2:
        # Not enough solid tiles for a ceiling
        return " no ceiling." if describe_absence else ""

    # Count contiguous runs of open tiles. Enemies also open a gap: they are
    # marked as "moving" rather than "passable".
    gaps = 0
    previous_open = False
    for tags in cell_descriptors:
        is_open = "passable" in tags or "moving" in tags
        if is_open and not previous_open:
            gaps += 1
        previous_open = is_open

    amount = describe_quantity(gaps) if coarse_counts else gaps
    suffix = "s" if pluralize and gaps != 1 else ""
    return f" ceiling with {amount} gap" + suffix + "."
def extract_tileset(tileset_path):
    """
    Load a tileset JSON file and derive the lookup tables built from it.

    Returns a 4-tuple: the sorted list of tile characters, the ID -> character
    mapping (IDs are positions in that sorted list), the inverse
    character -> ID mapping, and the descriptors from get_tile_descriptors.
    """
    # Load tileset
    with open(tileset_path, "r") as handle:
        tileset = json.load(handle)

    tile_chars = sorted(tileset['tiles'])
    # Wiggle room for the tileset to be a bit more flexible would require
    # adding the bogus tiles '!' and '*' to tile_chars; that is currently disabled.
    id_to_char = dict(enumerate(tile_chars))
    char_to_id = {symbol: index for index, symbol in id_to_char.items()}
    tile_descriptors = get_tile_descriptors(tileset)
    return tile_chars, id_to_char, char_to_id, tile_descriptors
def flood_fill(scene, visited, start_row, start_col, id_to_char, tile_descriptors, excluded, pipes=False, target_descriptor=None):
    """
    Collect the 4-connected group of matching tiles reachable from a start cell.

    A tile matches when it carries target_descriptor, if one is given. Otherwise
    it must be "solid" and its "pipe" status must agree with the pipes flag.
    Cells in *visited* or *excluded* are skipped; every collected cell is added
    to *visited* (mutated in place). Returns the collected (row, col) cells in
    the order they were reached.
    """
    pending = [(start_row, start_col)]
    component = []
    while pending:
        cell = pending.pop()
        if cell in visited or cell in excluded:
            continue
        r, c = cell
        symbol = id_to_char[scene[r][c]]
        tags = tile_descriptors.get(symbol, [])

        # Use target_descriptor if provided, otherwise default to old solid/pipe logic
        if target_descriptor is None:
            belongs = "solid" in tags and ("pipe" in tags) == bool(pipes)
        else:
            belongs = target_descriptor in tags
        if not belongs:
            continue

        visited.add(cell)
        component.append(cell)

        # Weird special case for adjacent pipes: never step right off a pipe's
        # right edge, nor left off its left edge.
        if symbol == '>' or symbol == ']':
            blocked_step = 1
        elif symbol == '<' or symbol == '[':
            blocked_step = -1
        else:
            blocked_step = None

        # Check neighbors
        for d_r, d_c in ((-1, 0), (1, 0), (0, -1), (0, 1)):
            if d_c == blocked_step:
                continue
            next_r, next_c = r + d_r, c + d_c
            if 0 <= next_r < len(scene) and 0 <= next_c < len(scene[0]):
                pending.append((next_r, next_c))
    return component