Code that is apparently needed by the pipeline

Browse files

Probably more than is needed, but we'll see.

Some files had to be modified from their original repo versions

Files changed (14) hide show

captions/caption_match.py +247 -0
captions/evaluate_caption_order_tolerance.py +288 -0
captions/util.py +216 -0
mapsheet.png +0 -0
models/general_training_helper.py +172 -0
models/latent_diffusion_pipeline.py +99 -0
models/pipeline_loader.py +41 -0
models/sentence_transformers_helper.py +114 -0
models/text_diffusion_pipeline.py +442 -0
models/text_model.py +206 -0
util/common_settings.py +18 -0
util/naming_conventions.py +29 -0
util/plotter.py +173 -0
util/sampler.py +473 -0

captions/caption_match.py ADDED Viewed

	@@ -0,0 +1,247 @@

+from create_ascii_captions import assign_caption
# Quantity order for scoring partial matches.
# quantity_score grades two phrases by the distance between the positions of
# their quantity terms in this list, so the list must stay sorted from fewest
# to most.
QUANTITY_TERMS = ["one", "two", "a few", "several", "many"]
# Topics to compare.
# extract_phrases walks this list in order and never reuses a phrase that an
# earlier topic has claimed, so a specific keyword (e.g. "broken pipe") has to
# appear before the general keyword it contains ("pipe").
TOPIC_KEYWORDS = [
    #"giant gap", # I think all gaps are subsumed by the floor topic
    "floor", "ceiling",
    "broken pipe", "upside down pipe", "pipe",
    "coin line", "coin",
    "platform", "tower", #"wall",
    "broken cannon", "cannon",
    "ascending staircase", "descending staircase",
    "rectangular",
    "irregular",
    "question block", "loose block",
    "enem"  # catch "enemy"/"enemies"
]
# Need list because the order matters
# NOTE(review): not referenced by the code visible in this file; presumably
# consumed by other modules that build negated captions -- confirm.
KEYWORD_TO_NEGATED_PLURAL = [
    (" broken pipe.", ""), # If not the first phrase
    ("broken pipe. ", ""), # If the first phrase (after removing all others)
    (" broken cannon.", ""), # If not the first phrase
    ("broken cannon. ", ""), # If the first phrase (after removing all others)
    ("pipe", "pipes"),
    ("cannon", "cannons"),
    ("platform", "platforms"),
    ("tower", "towers"),
    ("staircase", "staircases"),
    ("enem", "enemies"),
    ("rectangular", "rectangular block clusters"),
    ("irregular", "irregular block clusters"),
    ("coin line", "coin lines"),
    ("coin.", "coins."), # Need period to avoid matching "coin line"
    ("question block", "question blocks"),
    ("loose block", "loose blocks")
]
# NOTE(review): not referenced by the code visible in this file -- confirm usage.
BROKEN_TOPICS = 2 # Number of topics that are considered "broken" (e.g., "broken pipe", "broken cannon")
# Plural normalization map (irregulars): plural form -> singular form.
# normalize_plural applies these substitutions before stripping regular "s" endings.
PLURAL_EXCEPTIONS = {
    "enemies": "enemy",
}
def normalize_plural(phrase):
    """Return *phrase* with plural words reduced to a singular form.

    Irregular plurals listed in PLURAL_EXCEPTIONS are substituted first. After
    that, every whitespace-separated word that ends in a single "s" loses that
    letter. The words are joined back together with single spaces.
    """
    for irregular, base in PLURAL_EXCEPTIONS.items():
        phrase = phrase.replace(irregular, base)

    def strip_trailing_s(token):
        # A trailing "ss" (e.g. "class", "boss") is not a plural marker.
        if token.endswith('s') and not token.endswith('ss'):
            return token[:-1]
        return token

    return ' '.join(strip_trailing_s(token) for token in phrase.split())
def extract_phrases(caption, debug=False):
    """Map every topic in TOPIC_KEYWORDS to the caption phrase that covers it.

    The caption is split on periods into stripped, non-empty phrases. Topics
    are visited in list order; each takes the first phrase that contains the
    topic keyword and has not been claimed by an earlier topic. A phrase that
    starts with "no " counts as absence (the topic maps to None) and is not
    marked as claimed.

    Args:
        caption (str): Caption with phrases separated by periods.
        debug (bool): If True, print how each topic was resolved.

    Returns:
        dict: topic keyword -> matching phrase, or None when absent.
    """
    sentences = [piece.strip() for piece in caption.split('.') if piece.strip()]
    claimed = set()  # Phrases already assigned to an earlier topic
    result = {}
    for topic in TOPIC_KEYWORDS:
        candidate = next(
            (s for s in sentences if topic in s and s not in claimed),
            None,
        )
        if candidate is None:
            result[topic] = None
            if debug:
                print(f"[Extract] Topic '{topic}': no phrase found")
            continue
        if candidate.lower().startswith("no "):
            # "no ..." phrases are equivalent to the topic being absent
            result[topic] = None
            if debug:
                print(f"[Extract] Topic '{topic}': detected 'no ...', treating as None")
            continue
        result[topic] = candidate
        claimed.add(candidate)
        if debug:
            print(f"[Extract] Topic '{topic}': found phrase '{candidate}'")
    return result
def quantity_score(phrase1, phrase2, debug=False):
    """Score how close the quantity terms of two phrases are.

    Each phrase is assigned the first entry of QUANTITY_TERMS that occurs in it
    as a substring. When both phrases have one, the score is 1.0 minus the
    distance between the two list positions divided by the largest possible
    distance. When either phrase has none, the score is 0.1.

    Args:
        phrase1 (str): First phrase.
        phrase2 (str): Second phrase.
        debug (bool): If True, print the intermediate values.

    Returns:
        float: Score between 0.0 and 1.0.
    """
    def locate(text):
        # Position of the first quantity term contained in the text, if any
        for position, term in enumerate(QUANTITY_TERMS):
            if term in text:
                return position
        return None

    idx1, idx2 = locate(phrase1), locate(phrase2)
    if debug:
        qty1 = None if idx1 is None else QUANTITY_TERMS[idx1]
        qty2 = None if idx2 is None else QUANTITY_TERMS[idx2]
        print(f"[Quantity] Comparing quantities: '{qty1}' vs. '{qty2}'")

    if idx1 is None or idx2 is None:
        if debug:
            print("[Quantity] At least one quantity missing, assigning partial score 0.1")
        return 0.1

    diff = abs(idx1 - idx2)
    max_diff = len(QUANTITY_TERMS) - 1
    score = 1.0 - (diff / max_diff)
    if debug:
        print(f"[Quantity] Quantity indices: {idx1} vs. {idx2}, diff: {diff}, score: {score:.2f}")
    return score
def compare_captions(correct_caption, generated_caption, debug=False, return_matches=False):
    """Score how well a generated caption agrees with a reference caption.

    Both captions are split into per-topic phrases with extract_phrases. Every
    topic in TOPIC_KEYWORDS then contributes to a running total:

    * +1.0 when neither caption covers the topic;
    * -1.0 when exactly one of them does (a phrase found only in the generated
      caption is recorded as an excess phrase);
    * +1.0 when the plural-normalized phrases are identical;
    * the quantity_score value when both phrases contain a quantity term;
    * +0.1 otherwise.

    The total is divided by the number of topics.

    Args:
        correct_caption (str): Reference caption.
        generated_caption (str): Caption to evaluate.
        debug (bool): If True, print a trace of the comparison.
        return_matches (bool): If True, also return the match lists.

    Returns:
        float, or tuple: The final score; with return_matches=True the tuple
        (final_score, exact_matches, partial_matches, excess_phrases).
    """
    def log(*parts):
        if debug:
            print(*parts)

    def has_quantity(text):
        return any(term in text for term in QUANTITY_TERMS)

    reference_by_topic = extract_phrases(correct_caption, debug=debug)
    candidate_by_topic = extract_phrases(generated_caption, debug=debug)

    exact_matches, partial_matches, excess_phrases = [], [], []
    total_score = 0.0

    log("\n--- Starting Topic Comparison ---\n")
    for topic in TOPIC_KEYWORDS:
        correct = reference_by_topic[topic]
        generated = candidate_by_topic[topic]
        log(f"[Topic: {topic}] Correct: {correct} | Generated: {generated}")

        missing = (correct is None) + (generated is None)
        if missing == 2:
            total_score += 1.0
            log(f"[Topic: {topic}] Both None — full score: 1.0\n")
        elif missing == 1:
            total_score += -1.0
            if generated is not None:  # Considered an excess phrase
                excess_phrases.append(generated)
            log(f"[Topic: {topic}] One is None — penalty: -1.0\n")
        else:
            # Compare on plural-normalized text so "pipe"/"pipes" agree
            norm_correct = normalize_plural(correct)
            norm_generated = normalize_plural(generated)
            log(f"[Topic: {topic}] Normalized: Correct: '{norm_correct}' | Generated: '{norm_generated}'")
            if norm_correct == norm_generated:
                total_score += 1.0
                exact_matches.append(generated)
                log(f"[Topic: {topic}] Exact match — score: 1.0\n")
            elif has_quantity(norm_correct) and has_quantity(norm_generated):
                qty_score = quantity_score(norm_correct, norm_generated, debug=debug)
                total_score += qty_score
                partial_matches.append(generated)
                log(f"[Topic: {topic}] Quantity-based partial score: {qty_score:.2f}\n")
            else:
                total_score += 0.1
                partial_matches.append(generated)
                log(f"[Topic: {topic}] Partial match (topic overlap) — score: 0.1\n")
        log(f"[Topic: {topic}] Current total score: {total_score:.4f}\n")

    num_topics = len(TOPIC_KEYWORDS)
    log("total_score before normalization:", total_score)
    log(f"Number of topics: {num_topics}")
    final_score = total_score / num_topics
    log(f"--- Final score: {final_score:.4f} ---\n")

    if not return_matches:
        return final_score
    return final_score, exact_matches, partial_matches, excess_phrases
def process_scene_segments(scene, segment_width, prompt, id_to_char, char_to_id, tile_descriptors, describe_locations, describe_absence, verbose=False):
    """
    Process a scene by partitioning it into segments, assigning captions, and computing comparison scores.
    Args:
        scene (list): The scene to process, represented as a 2D list.
        segment_width (int): The width of each segment. Must be positive.
        prompt (str): The prompt to compare captions against.
        id_to_char (dict): Mapping from tile IDs to characters.
        char_to_id (dict): Mapping from characters to tile IDs.
        tile_descriptors (dict): Descriptions of individual tile types.
        describe_locations (bool): Whether to include location descriptions in captions.
        describe_absence (bool): Whether to indicate absence of items in captions.
        verbose (bool): If True, print captions and scores for each segment.
    Returns:
        tuple: A tuple containing the average comparison score, captions for each segment, and scores for each segment.
            An empty scene yields (0, [], []).
    Raises:
        ValueError: If segment_width is not positive.
    """
    if segment_width <= 0:
        # range() rejects a zero step and silently yields nothing for a
        # negative one; fail loudly in both cases.
        raise ValueError(f"segment_width must be positive, got {segment_width}")

    # A scene without rows has no first row to measure; treat it as zero width
    scene_width = len(scene[0]) if scene else 0

    # Partition the scene into segments of the specified width
    segments = [
        [row[start:start + segment_width] for row in scene]
        for start in range(0, scene_width, segment_width)
    ]

    # Assign captions and compute scores for each segment
    segment_scores = []
    segment_captions = []
    for number, segment in enumerate(segments, start=1):
        segment_caption = assign_caption(segment, id_to_char, char_to_id, tile_descriptors, describe_locations, describe_absence)
        segment_score = compare_captions(prompt, segment_caption)
        segment_scores.append(segment_score)
        segment_captions.append(segment_caption)
        if verbose:
            print(f"Segment {number} caption: {segment_caption}")
            print(f"Segment {number} comparison score: {segment_score}")

    # Compute the average comparison score
    average_score = sum(segment_scores) / len(segment_scores) if segment_scores else 0
    if verbose:
        print(f"Average comparison score across all segments: {average_score}")
    return average_score, segment_captions, segment_scores
if __name__ == '__main__':
    # Demo: compare a reference caption against a generated caption that
    # words the floor differently, printing the full debug trace.
    expected_caption = "floor with one gap. two enemies. one platform. one tower."
    actual_caption = "giant gap with one chunk of floor. two enemies. one platform. one tower."
    demo_score = compare_captions(expected_caption, actual_caption, debug=True)
    print("\n".join([
        f"Should be: {expected_caption}",
        f"  but was: {actual_caption}",
        f"Score: {demo_score}",
    ]))

captions/evaluate_caption_order_tolerance.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import argparse
+import itertools
+import os
+import random
+from collections import defaultdict
+import sys, os
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import util.common_settings as common_settings  # adjust import if needed
+from level_dataset import LevelDataset, visualize_samples, colors, mario_tiles  # adjust import if needed
+from torch.utils.data import DataLoader
+from evaluate_caption_adherence import calculate_caption_score_and_samples  # adjust import if needed
+import matplotlib.pyplot as plt
+import matplotlib
+import json
+from tqdm import tqdm
+import numpy as np
+import torch
+from tqdm import tqdm
+from captions.util import extract_tileset
+from models.pipeline_loader import get_pipeline
def parse_args():
    """Parse the command-line options for the caption order tolerance evaluation.

    Returns:
        argparse.Namespace: The parsed options.
    """
    parser = argparse.ArgumentParser(description="Evaluate caption order tolerance for a diffusion model.")
    parser.add_argument("--model_path", type=str, required=True, help="Path to the trained diffusion model")
    parser.add_argument("--caption", type=str, required=False, default=None, help="Caption to evaluate, phrases separated by periods")
    parser.add_argument("--tileset", type=str, help="Path to the tileset JSON file")
    # The default is built with os.path.join so it resolves on POSIX as well as
    # on Windows (a literal backslash is not a path separator on POSIX).
    # Datasets previously used as the default:
    #   Test_for_caption_order_tolerance.json
    #   SMB1_LevelsAndCaptions-regular-test.json
    parser.add_argument("--json", type=str, default=os.path.join("datasets", "Mar1and2_LevelsAndCaptions-regular.json"), help="Path to dataset json file")
    parser.add_argument("--inference_steps", type=int, default=common_settings.NUM_INFERENCE_STEPS)
    parser.add_argument("--guidance_scale", type=float, default=common_settings.GUIDANCE_SCALE)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--game", type=str, choices=["Mario", "LR"], default="Mario", help="Game to evaluate (Mario or Lode Runner)")
    parser.add_argument("--describe_absence", action="store_true", default=False, help="Indicate when there are no occurrences of an item or structure")
    parser.add_argument("--save_as_json", action="store_true", help="Save generated levels as JSON")
    parser.add_argument("--output_dir", type=str, default="visualizations", help="Output directory if not comparing checkpoints (subdir of model directory)")
    parser.add_argument("--max_permutations", type=int, default=5, help="Maximum amount of permutations that can be made per caption")
    return parser.parse_args()
def setup_environment(seed):
    """Seed the Python, NumPy and PyTorch random generators and pick a device.

    Args:
        seed (int): Seed applied to every generator.

    Returns:
        str: "cuda" when torch reports CUDA as available, otherwise "cpu".
    """
    cuda_available = torch.cuda.is_available()
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if cuda_available:
        torch.cuda.manual_seed_all(seed)
    return "cuda" if cuda_available else "cpu"
def load_captions_from_json(json_path):
    """Read a JSON file and collect the "caption" value of each entry.

    The file is expected to hold a list of dicts; entries without a "caption"
    key are skipped.

    Args:
        json_path (str): Path to the JSON file.

    Returns:
        list: The caption values, in file order.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)
    captions = []
    for entry in entries:
        if "caption" in entry:
            captions.append(entry["caption"])
    return captions
def creation_of_parameters(caption, max_permutations):
    """Build everything needed to evaluate the phrase permutations of a caption.

    Command-line options are re-parsed here to obtain the seed, game, model
    path and optional tileset path.

    Args:
        caption (str or list): One caption, or a list of captions. Phrases
            within a caption are separated by periods.
        max_permutations (int): Upper bound on the permutations kept per
            caption; when more exist, a random sample of this size is taken.

    Returns:
        tuple: (pipe, device, id_to_char, char_to_id, tile_descriptors,
        num_tiles, dataloader, perm_captions, caption_data).

    Raises:
        TypeError: If caption is neither a str nor a list.
        ValueError: If the game is unknown or no caption was supplied.
    """
    args = parse_args()
    device = setup_environment(args.seed)

    # os.path.join keeps the default paths portable and avoids the invalid
    # escape sequences ('\T', '\S', '\L') of the former backslash literals.
    if args.game == "Mario":
        num_tiles = common_settings.MARIO_TILE_COUNT
        default_tileset = os.path.join('..', 'TheVGLC', 'Super Mario Bros', 'smb.json')
    elif args.game == "LR":
        num_tiles = common_settings.LR_TILE_COUNT
        default_tileset = os.path.join('..', 'TheVGLC', 'Lode Runner', 'Loderunner.json')
    else:
        raise ValueError(f"Unknown game: {args.game}")
    # Honor --tileset when given; it used to be parsed and then ignored
    tileset = args.tileset or default_tileset

    # Load pipeline
    pipe = get_pipeline(args.model_path).to(device)

    # Load tile metadata
    tile_chars, id_to_char, char_to_id, tile_descriptors = extract_tileset(tileset)

    if isinstance(caption, str):
        captions = [caption]
    elif isinstance(caption, list):
        captions = caption
    else:
        raise TypeError(f"caption must be a str or a list of str, got {type(caption).__name__}")

    perm_captions = []
    for cap in captions:
        # Split caption into phrases and get all permutations
        phrases = [p.strip() for p in cap.split('.') if p.strip()]
        # NOTE: this materializes every permutation (factorial in the number
        # of phrases) before sampling, so long captions are expensive.
        perms = list(itertools.permutations(phrases))
        if len(perms) > max_permutations:
            perms = random.sample(perms, max_permutations)
        perm_captions.extend('.'.join(perm) + '.' for perm in perms)

    if not perm_captions:
        # Would otherwise surface later as a DataLoader error for batch_size=0
        raise ValueError("No captions were supplied, so there is nothing to permute")

    # Create a list of dicts as expected by LevelDataset
    caption_data = [{"scene": None, "caption": cap} for cap in perm_captions]

    # Initialize dataset
    dataset = LevelDataset(
        data_as_list=caption_data,
        shuffle=False,
        mode="text",
        augment=False,
        num_tiles=num_tiles,  # tile count of the selected game, not always Mario's
        negative_captions=False,
        block_embeddings=None
    )

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=min(16, len(perm_captions)),
        shuffle=False,
        num_workers=4,
        drop_last=False,
        persistent_workers=True
    )
    return pipe, device, id_to_char, char_to_id, tile_descriptors, num_tiles, dataloader, perm_captions, caption_data
def statistics_of_captions(captions, dataloader, compare_all_scores, pipe=None, device=None, id_to_char=None, char_to_id=None, tile_descriptors=None, num_tiles=None):
    """
    Print and return summary statistics of the caption scores.

    Command-line options are re-parsed only to report the dataset path. The
    parameters other than captions and compare_all_scores are not used in the
    body.

    Returns:
        tuple: (compare_all_scores, average, standard deviation, minimum,
        maximum, median), or None when captions is empty.
    """
    args = parse_args()
    if not captions:
        print("No captions found in the provided JSON file.")
        return None
    print(f"\nLoaded {len(captions)} captions from {args.json}")

    avg_score, std_dev_score, min_score, max_score, median_score = (
        reducer(compare_all_scores)
        for reducer in (np.mean, np.std, np.min, np.max, np.median)
    )

    print("\n-----Scores for each caption permutation-----")
    for number, score in enumerate(compare_all_scores, start=1):
        print(f"Scores for caption {number}:", score)

    print("\n-----Statistics of captions-----")
    summary = (
        ("Average score", avg_score),
        ("Standard deviation", std_dev_score),
        ("Minimum score", min_score),
        ("Maximum score", max_score),
        ("Median score", median_score),
    )
    for label, value in summary:
        print(f"{label}: {value:.4f}")

    return compare_all_scores, avg_score, std_dev_score, min_score, max_score, median_score
def main():
    """Evaluate how the caption score varies across phrase-order permutations.

    Captions come from --caption (a single string, optionally bundling several
    comma-separated captions) or, when that is missing or empty, from the
    --json dataset. Each caption is permuted, scored and summarized.

    With --save_as_json, one JSON record per caption is streamed to
    evaluation_caption_order_results.jsonl in --output_dir, followed by an
    optional single-caption record and one summary record, each on its own
    line. Mean, standard deviation and median of the summary are computed over
    the pooled permutation scores of all captions.
    """
    from contextlib import nullcontext  # only needed by this entry point

    args = parse_args()
    single_caption_mode = args.caption not in (None, "")
    if single_caption_mode:
        caption = args.caption
        # Wrap the string so the comprehension below iterates over captions
        # instead of over the characters of one caption.
        raw_captions = [caption]
    else:
        caption = load_captions_from_json(args.json)
        raw_captions = caption
    # An entry may bundle several captions separated by commas
    all_captions = [
        item.strip()
        for raw in raw_captions
        for item in raw.split(",")
        if item.strip()
    ]

    all_scores = []
    all_avg_scores = []
    all_min_scores = []
    all_max_scores = []

    output_jsonl_path = os.path.join(args.output_dir, "evaluation_caption_order_results.jsonl")
    if args.save_as_json:
        os.makedirs(args.output_dir, exist_ok=True)
        results_file = open(output_jsonl_path, "w", encoding="utf-8")
    else:
        # Nothing to write: avoid creating an empty results file
        results_file = nullcontext()

    with results_file as f:
        for count, one_caption in enumerate(all_captions):
            # NOTE: this rebuilds the pipeline and dataset for every caption
            pipe, device, id_to_char, char_to_id, tile_descriptors, num_tiles, dataloader, perm_caption, caption_data = creation_of_parameters(one_caption, args.max_permutations)
            if not pipe:
                print("Failed to create pipeline.")
                return
            avg_score, all_samples, all_prompts, compare_all_scores = calculate_caption_score_and_samples(device, pipe, dataloader, args.inference_steps, args.guidance_scale, args.seed, id_to_char, char_to_id, tile_descriptors, args.describe_absence, output=True, height=common_settings.MARIO_HEIGHT, width=common_settings.MARIO_WIDTH)
            stats = statistics_of_captions(perm_caption, dataloader, compare_all_scores, pipe, device, id_to_char, char_to_id, tile_descriptors, num_tiles)
            if stats is None:
                # statistics_of_captions returns None when it has no captions
                continue
            scores, avg_score, std_dev_score, min_score, max_score, median_score = stats

            if f is not None:
                result_entry = {
                    "Caption": one_caption,
                    "Average score for all permutations": float(avg_score),
                    "Standard deviation": float(std_dev_score),
                    "Minimum score": float(min_score),
                    "Maximum score": float(max_score),
                    "Median score": float(median_score)
                }
                f.write(json.dumps(result_entry) + "\n")
                if (count % 10) == 0:
                    f.flush()  # Push buffered results out periodically
                    os.fsync(f.fileno())  # Ensure file is flushed to disk

            all_avg_scores.append(avg_score)
            all_scores.extend(scores)  # the scores themselves, not (index, score) pairs
            all_min_scores.append(min_score)
            all_max_scores.append(max_score)

    if not all_avg_scores:
        print("No captions were evaluated.")
        return

    print(f"\nAverage score across all captions: {np.mean(all_avg_scores):.4f}")

    # all_samples / all_prompts hold the output of the last evaluated caption
    visualizations_dir = os.path.join(os.path.dirname(__file__), "visualizations")
    if single_caption_mode:
        caption_folder = args.caption.replace(" ", "_").replace(".", "_")
        output_directory = os.path.join(visualizations_dir, caption_folder)
        visualize_samples(
            all_samples,
            output_dir=output_directory,
            prompts=all_prompts[0] if all_prompts else "No prompts available"
        )
        print(f"\nVisualizations saved to: {output_directory}")
    print("\nAll samples shape:", all_samples.shape)
    print("\nAll prompts:", all_prompts)

    # Pooled statistics over every permutation score of every caption
    all_avg_score = np.mean(all_scores)
    all_std_dev_score = np.std(all_scores)
    all_min_score = np.min(all_min_scores)
    all_max_score = np.max(all_max_scores)
    all_median_score = np.median(all_scores)

    if args.save_as_json:
        # Append: the per-caption records written during the loop must survive
        with open(output_jsonl_path, "a", encoding="utf-8") as f:
            if single_caption_mode:
                result_entry = {
                    "caption": caption,
                    "avg_score": float(all_avg_scores[-1]),
                    "samples": all_samples.tolist(),
                    "prompts": all_prompts
                }
                f.write(json.dumps(result_entry) + "\n")
            results = {
                "Scores of all captions": {
                    "Scores": all_scores,
                    "Number of captions": len(all_scores),
                    "Average of all permutations": float(all_avg_score),
                    "Standard deviation of all permutations": float(all_std_dev_score),
                    "Min score of all permutations": float(all_min_score),
                    "Max score of all permutations": float(all_max_score),
                    "Median score of all permutations": float(all_median_score)
                },
            }
            # One line per record keeps the file valid JSON Lines
            f.write(json.dumps(results) + "\n")
        print(f"Results saved to {output_jsonl_path}")


if __name__ == "__main__":
    main()

captions/util.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import json
+import sys
+import os
+from collections import Counter
# This file contains utility functions for analyzing and describing levels in both Lode Runner and Super Mario Bros.
# Could define these via the command line, but for now they are hardcoded
# NOTE(review): coarse_locations is not read by the code visible in this file -- confirm where it is used.
coarse_locations = True
# When True, captions spell counts as describe_quantity words ("one", "a few", ...) instead of raw numbers.
coarse_counts = True
# When True, caption nouns take their plural form where the count calls for it.
pluralize = True
# NOTE(review): give_staircase_lengths is not read by the code visible in this file -- confirm where it is used.
give_staircase_lengths = False
def describe_size(count):
    """Return "small" for counts of 4 or less and "big" for anything larger."""
    return "small" if count <= 4 else "big"
def describe_quantity(count):
    """Translate a count into the coarse quantity word used in captions.

    0 -> "no", 1 -> "one", 2 -> "two", other values below 5 -> "a few",
    values below 10 -> "several", everything else -> "many".
    """
    for exact, word in ((0, "no"), (1, "one"), (2, "two")):
        if count == exact:
            return word
    if count < 5:
        return "a few"
    return "several" if count < 10 else "many"
def get_tile_descriptors(tileset):
    """Build a mapping from tile character to its set of descriptors.

    The entries of tileset["tiles"] are converted to sets, and the fake tiles
    "!" and "*" are then set to {"passable"}.
    """
    descriptors = {}
    for char, attrs in tileset["tiles"].items():
        descriptors[char] = set(attrs)
    # Fake tiles. Should these contain anything? Note that code elsewhere expects everything to be passable or solid
    for fake_tile in ("!", "*"):
        descriptors[fake_tile] = {"passable"}
    return descriptors
def analyze_floor(scene, id_to_char, tile_descriptors, describe_absence):
    """Analyze the last row of the scene and generate a floor description.

    Args:
        scene (list): 2D list of tile IDs; the last row is treated as the floor.
        id_to_char (dict): Mapping from tile IDs to tile characters.
        tile_descriptors (dict): Mapping from tile characters to descriptors.
        describe_absence (bool): Whether a missing floor is reported as
            "no floor" rather than as an empty string.

    Returns:
        str: "full floor", "no floor" or "", "floor with <n> gap(s)" when solid
        tiles outnumber passable ones, otherwise
        "giant gap with <n> chunk(s) of floor".

    Raises:
        KeyError: If a floor tile ID is missing from id_to_char.
        ValueError: If a floor tile is neither passable, solid, nor an enemy.
    """
    WIDTH = len(scene[0])
    last_row = scene[-1]  # The FLOOR row of the scene
    # Look the descriptors of each floor tile up once instead of per check
    row_descriptors = [tile_descriptors.get(id_to_char[tile], []) for tile in last_row]

    def is_open(descriptors):
        # Enemies are also a gap since they immediately fall into the gap
        return "passable" in descriptors or "enemy" in descriptors

    def is_solid(descriptors):
        return "solid" in descriptors

    def count_runs(starts_run, ends_run):
        # Count contiguous groups of tiles accepted by starts_run; starts_run
        # is tested first, so it wins for a tile that satisfies both.
        runs = 0
        inside = False
        for tile, descriptors in zip(last_row, row_descriptors):
            if starts_run(descriptors):
                if not inside:
                    runs += 1
                    inside = True
            elif ends_run(descriptors):
                inside = False
            else:
                raise ValueError(
                    "Every tile should be passable, solid, or enemy; "
                    f"tile {tile!r} (char {id_to_char[tile]!r}) has descriptors {descriptors!r}"
                )
        return runs

    solid_count = sum(1 for d in row_descriptors if "solid" in d or "diggable" in d)
    passable_count = sum(1 for d in row_descriptors if "passable" in d)

    if solid_count == WIDTH:
        return "full floor"
    elif passable_count == WIDTH:
        if describe_absence:
            return "no floor"
        else:
            return ""
    elif solid_count > passable_count:
        # Mostly floor: describe the gaps in it
        gaps = count_runs(is_open, is_solid)
        return f"floor with {describe_quantity(gaps) if coarse_counts else gaps} gap" + ("s" if pluralize and gaps != 1 else "")
    else:
        # Mostly gap: describe the chunks of floor left
        chunks = count_runs(is_solid, is_open)
        return f"giant gap with {describe_quantity(chunks) if coarse_counts else chunks} chunk"+("s" if pluralize and chunks != 1 else "")+" of floor"
def count_in_scene(scene, tiles, exclude=frozenset()):
    """Count the cells of the scene whose tile is in *tiles*.

    Args:
        scene (list): 2D list of tiles.
        tiles: Container of the tile values to count.
        exclude: Container of (row, col) coordinates that are skipped. The
            default is an immutable empty set, so it cannot be altered between
            calls.

    Returns:
        int: Number of matching cells outside *exclude*.
    """
    return sum(
        1
        for r, row in enumerate(scene)
        for c, t in enumerate(row)
        if (r, c) not in exclude and t in tiles
    )
def count_caption_phrase(scene, tiles, name, names, offset = 0, describe_absence=False, exclude=frozenset()):
    """Build the caption phrase that counts the given tiles in the scene.

    Args:
        scene (list): 2D list of tiles.
        tiles: Container of the tile values to count.
        name (str): Singular noun for the phrase.
        names (str): Plural noun for the phrase.
        offset (int): Added to the tile count before it is described.
        describe_absence (bool): Whether a count of zero or less produces a
            " no <names>." phrase instead of an empty string.
        exclude: Container of (row, col) coordinates that are not counted. The
            default is an immutable empty set, so it cannot be altered between
            calls.

    Returns:
        str: A phrase with a leading space and a trailing period, or "".
    """
    count = offset + count_in_scene(scene, tiles, exclude)
    if count > 0:
        quantity = describe_quantity(count) if coarse_counts else count
        noun = names if pluralize and count > 1 else name
        return f" {quantity} {noun}."
    if describe_absence:
        return f" no {names}."
    return ""
def in_column(scene, x, tile):
    """Return True if any row of the scene holds *tile* at column index x."""
    return any(row[x] == tile for row in scene)
def analyze_ceiling(scene, id_to_char, tile_descriptors, describe_absence, ceiling_row = 1):
    """
    Inspect one row of the scene (0-based index ceiling_row) for a ceiling.

    Returns " full ceiling." when every tile of the row is solid, and
    " ceiling with <n> gap(s)." when more than half of them are. Otherwise it
    returns " no ceiling." if describe_absence is set, or an empty string.
    """
    width = len(scene[0])
    row_descriptors = [
        tile_descriptors.get(id_to_char[tile], []) for tile in scene[ceiling_row]
    ]
    solid_count = sum("solid" in descriptors for descriptors in row_descriptors)

    if solid_count == width:
        return " full ceiling."
    if solid_count <= width // 2:
        # Not enough solid tiles for a ceiling
        return " no ceiling." if describe_absence else ""

    # Count contiguous runs of open tiles. Tiles marked "moving" (enemies)
    # count as open too, because they are not marked "passable".
    gaps = 0
    previous_open = False
    for descriptors in row_descriptors:
        currently_open = "passable" in descriptors or "moving" in descriptors
        if currently_open and not previous_open:
            gaps += 1
        previous_open = currently_open

    quantity = describe_quantity(gaps) if coarse_counts else gaps
    suffix = "s" if pluralize and gaps != 1 else ""
    return f" ceiling with {quantity} gap" + suffix + "."
def extract_tileset(tileset_path):
    """Load a tileset JSON file and derive the tile lookup tables.

    Args:
        tileset_path (str): Path to a JSON file with a "tiles" mapping from
            tile character to descriptors.

    Returns:
        tuple: (tile_chars, id_to_char, char_to_id, tile_descriptors), where
        tile_chars is the sorted list of tile characters, IDs are positions in
        that list, and tile_descriptors comes from get_tile_descriptors.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default,
    # and close the file before the data is processed.
    with open(tileset_path, "r", encoding="utf-8") as f:
        tileset = json.load(f)

    # NOTE: the fake tiles '!' and '*' are deliberately not appended to
    # tile_chars, so they receive no ID here.
    tile_chars = sorted(tileset['tiles'].keys())
    id_to_char = dict(enumerate(tile_chars))
    char_to_id = {char: idx for idx, char in enumerate(tile_chars)}
    tile_descriptors = get_tile_descriptors(tileset)
    return tile_chars, id_to_char, char_to_id, tile_descriptors
def flood_fill(scene, visited, start_row, start_col, id_to_char, tile_descriptors, excluded, pipes=False, target_descriptor=None):
    """Collect the connected cells reachable from a start cell.

    Cells are explored depth-first through their 4-neighbors. A cell belongs
    to the structure when its descriptors contain target_descriptor, if one is
    given. Otherwise it must be solid and be a pipe tile exactly when pipes is
    set. Cells in visited or excluded are skipped; accepted cells are added to
    visited.

    Returns:
        list: The (row, col) cells of the structure, in visiting order.
    """
    pending = [(start_row, start_col)]
    component = []
    while pending:
        cell = pending.pop()
        if cell in visited or cell in excluded:
            continue
        row, col = cell
        char = id_to_char[scene[row][col]]
        tags = tile_descriptors.get(char, [])

        if target_descriptor is None:
            # Default rule: solid tiles only, pipes and non-pipes kept apart
            belongs = "solid" in tags and ("pipe" in tags) == bool(pipes)
        else:
            belongs = target_descriptor in tags
        if not belongs:
            continue

        visited.add(cell)
        component.append(cell)

        # Weird special case for adjacent pipes: never step right off the
        # right edge of a pipe, or left off the left edge of a pipe.
        if char in ('>', ']'):
            blocked_d_col = 1
        elif char in ('<', '['):
            blocked_d_col = -1
        else:
            blocked_d_col = None

        # Check neighbors
        for d_row, d_col in ((-1, 0), (1, 0), (0, -1), (0, 1)):
            if d_col == blocked_d_col:
                continue
            n_row, n_col = row + d_row, col + d_col
            if 0 <= n_row < len(scene) and 0 <= n_col < len(scene[0]):
                pending.append((n_row, n_col))
    return component

mapsheet.png ADDED Viewed

models/general_training_helper.py ADDED Viewed

	@@ -0,0 +1,172 @@

+from torch.utils.data import DataLoader
+from level_dataset import LevelDataset
+import random
+from util.plotter import Plotter
+from datetime import datetime
+import os
+import threading
+import json
+import torch.nn.functional as F
+import torch
+def create_dataloaders(json_path, val_json, tokenizer, data_mode, augment, num_tiles,
+                       negative_prompt_training, block_embeddings, batch_size):
+    # Initialize dataset
+    train_dataset = LevelDataset(
+        json_path=json_path,
+        tokenizer=tokenizer,
+        shuffle=True,
+        mode=data_mode,
+        augment=augment,
+        num_tiles=num_tiles,
+        negative_captions=negative_prompt_training,
+        block_embeddings=block_embeddings
+    )
+    val_dataset = None
+    if val_json is not None:
+        val_dataset = LevelDataset(
+            json_path=val_json,
+            tokenizer=tokenizer,
+            shuffle=False,
+            mode=data_mode,
+            augment=False,
+            num_tiles=num_tiles,
+            negative_captions=negative_prompt_training,
+            block_embeddings=block_embeddings
+        )
+    # Create dataloader
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        shuffle=True,
+        num_workers=4,
+        drop_last=True,
+        persistent_workers=True
+    )
+    val_dataloader = None
+    if val_dataset is not None:
+        val_dataloader = DataLoader(
+            val_dataset,
+            batch_size=batch_size,
+            shuffle=False,
+            num_workers=4,
+            drop_last=False,
+            persistent_workers=True
+        )
+    return train_dataloader, val_dataloader
+def get_random_training_samples(train_dataloader, negative_prompt_training, output_dir = None):
+    train_dataset = train_dataloader.dataset
+    # Sample four random captions from the dataset
+    sample_indices = [random.randint(0, len(train_dataset) - 1) for _ in range(4)]
+    sample_captions = [train_dataset[i][1] for i in sample_indices]
+    print("Sample captions:")
+    for caption in sample_captions:
+        print(caption)
+    sample_negative_captions = ""
+    if negative_prompt_training:
+        sample_negative_captions = [train_dataset[i][2] for i in sample_indices]
+        print("Sample negative captions:")
+        for caption in sample_negative_captions:
+            print(f"  NEG: {caption}")
+    #Write captions to a file
+    if output_dir is not None:
+        os.makedirs(output_dir, exist_ok=True)
+        out_path = os.path.join(output_dir, "sample_captions.txt")
+        with open(out_path, "w", encoding="utf-8") as f:
+            f.write("Sample captions:\n")
+            for caption in sample_captions:
+                f.write(str(caption) + "\n")
+            if negative_prompt_training:
+                f.write("\nSample negative captions:\n")
+                for caption in sample_negative_captions:
+                    f.write(str(caption) + "\n")
+        print(f"Sample captions written to {out_path}")
+    return sample_captions, sample_negative_captions
+def start_plotter(log_file, output_dir, left_key, right_key, left_label, right_label, png_name):
+    formatted_date = datetime.now().strftime(r'%Y%m%d-%H%M%S')
+    plotter = Plotter(log_file, update_interval=5.0, left_key=left_key, right_key=right_key,
+                            left_label=left_label, right_label=right_label, output_png=f'{png_name}_{formatted_date}.png')
+    plot_thread = threading.Thread(target=plotter.start_plotting)
+    plot_thread.daemon = True
+    plot_thread.start()
+    print(f"{png_name} plotting enabled. Progress will be saved to {os.path.join(output_dir, f'{png_name}_{formatted_date}.png')}")
+    return plotter, plot_thread
+def kill_plotter(plotter, plot_thread):
+    if plot_thread and plot_thread.is_alive():
+        plotter.stop_plotting()
+        plot_thread.join(timeout=5.0)
+        if plot_thread.is_alive():
+            print("Warning: Plot thread did not terminate properly")
+def load_config_from_json(config_path):
+    """Load hyperparameters from a JSON config file."""
+    try:
+        with open(config_path, 'r') as f:
+            config = json.load(f)
+            print(f"Configuration loaded from {config_path}")
+            # Print the loaded config for verification
+            print("Loaded hyperparameters:")
+            for key, value in config.items():
+                print(f"  {key}: {value}")
+            return config
+    except (json.JSONDecodeError, FileNotFoundError) as e:
+        print(f"Error loading config file: {e}")
+        raise e
+def update_args_from_config(args, config):
+    """Update argparse namespace with values from config."""
+    # Convert config dict to argparse namespace
+    for key, value in config.items():
+        if hasattr(args, key):
+            setattr(args, key, value)
+    return args
+def get_scene_from_embeddings(image, block_embeddings):
+    """Code copied over from level_dataset, should give limited support for block embeddings"""
+    # Reshape sample to [batch_size * height * width, embedding_dim]
+    batch_size, embedding_dim, height, width = image.shape
+    flat_samples = image.permute(0, 2, 3, 1).reshape(-1, embedding_dim)
+    # Normalize vectors for cosine similarity
+    flat_samples = F.normalize(flat_samples, p=2, dim=1).cpu()
+    block_embeddings = F.normalize(block_embeddings, p=2, dim=1)
+    # Calculate cosine similarity between each position and all tile embeddings
+    similarities = torch.matmul(flat_samples, block_embeddings.t())
+    # Get indices of most similar tiles
+    indices = torch.softmax(similarities, dim=1)
+    # Reshape back to [batch_size, height, width]
+    indices = indices.reshape(batch_size, height, width, 13)
+    indices = indices.permute(0, 3, 1, 2)
+    image=indices.detach().cpu()
+    return image

models/latent_diffusion_pipeline.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from diffusers import DDPMPipeline
+import torch
+import torch.nn.functional as F
+from typing import Optional, Union, List, Tuple
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.pipelines.ddpm.pipeline_ddpm import ImagePipelineOutput
+import util.common_settings as common_settings
+import os
+import json
+from models.general_training_helper import get_scene_from_embeddings
+class UnconditionalDDPMPipeline(DDPMPipeline):
+    def __init__(self, unet, scheduler, block_embeddings=None):
+        super().__init__(unet, scheduler)
+        self.block_embeddings = block_embeddings
+    def save_pretrained(self, save_directory):
+        os.makedirs(save_directory, exist_ok=True)
+        super().save_pretrained(save_directory)
+        # Save block_embeddings tensor if it exists
+        if self.block_embeddings is not None:
+            torch.save(self.block_embeddings, os.path.join(save_directory, "block_embeddings.pt"))
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path, **kwargs):
+        pipeline = super().from_pretrained(pretrained_model_path, **kwargs)
+        # Load block_embeddings tensor if it exists
+        block_embeds_path = os.path.join(pretrained_model_path, "block_embeddings.pt")
+        if os.path.exists(block_embeds_path):
+            pipeline.block_embeddings = torch.load(block_embeds_path, map_location="cpu")
+        else:
+            pipeline.block_embeddings = None
+        return pipeline
+    def give_sprite_scaling_factors(self, sprite_scaling_factors):
+        """
+        Set the sprite scaling factors for the pipeline.
+        This is used to apply per-sprite temperature scaling during inference.
+        """
+        self.sprite_scaling_factors = sprite_scaling_factors
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        num_inference_steps: int = common_settings.NUM_INFERENCE_STEPS,
+        output_type: Optional[str] = "tensor",
+        return_dict: bool = True,
+        height: int = common_settings.MARIO_HEIGHT, width: int = common_settings.MARIO_WIDTH,
+        latents: Optional[torch.FloatTensor] = None,
+        show_progress_bar=True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        self.unet.eval()
+        with torch.no_grad():
+            if latents is not None:
+                image = latents.to(self.device)
+            else:
+                image_shape = (
+                    batch_size,
+                    self.unet.config.in_channels,
+                    height,
+                    width
+                )
+                image = torch.randn(image_shape, generator=generator, device=self.device)
+            self.scheduler.set_timesteps(num_inference_steps)
+            iterator = self.progress_bar(self.scheduler.timesteps) if show_progress_bar else self.scheduler.timesteps
+            for t in iterator:
+                #print(image.shape)
+                model_output = self.unet(image, t).sample
+                image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
+            # Apply per-sprite temperature scaling if enabled
+            if hasattr(self,"sprite_scaling_factors") and self.sprite_scaling_factors is not None:
+                image = image / self.sprite_scaling_factors.view(1, -1, 1, 1)
+            if self.block_embeddings is not None:
+                image = get_scene_from_embeddings(image, self.block_embeddings)
+            else:
+                image = F.softmax(image, dim=1)
+                image = image.detach().cpu()
+            if not return_dict:
+                return (image,)
+            return ImagePipelineOutput(images=image)
+    def print_unet_architecture(self):
+        """Prints the architecture of the UNet model."""
+        print(self.unet)

models/pipeline_loader.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from models.text_diffusion_pipeline import TextConditionalDDPMPipeline
+from models.latent_diffusion_pipeline import UnconditionalDDPMPipeline
+import os
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+def get_pipeline(model_path):
+    # If model_path is a local directory, use the original logic
+    if os.path.isdir(model_path):
+        #Diffusion models
+        if os.path.exists(os.path.join(model_path, "unet")):
+            if os.path.exists(os.path.join(model_path, "text_encoder")):
+                #If it has a text encoder and a unet, it's text conditional diffusion
+                pipe = TextConditionalDDPMPipeline.from_pretrained(model_path)
+            else:
+                #If it has no text encoder, use the unconditional diffusion model
+                pipe = UnconditionalDDPMPipeline.from_pretrained(model_path)
+    else:
+        # Assume it's a Hugging Face Hub model ID
+        # Try to load config to determine if it's text-conditional
+        try:
+            config, _ = DiffusionPipeline.load_config(model_path)
+            components = config.get("components", {})
+        except Exception:
+            components = {}
+        if "text_encoder" in components or "text_encoder" in str(components):
+            # Use the local pipeline file for custom_pipeline
+            pipe = DiffusionPipeline.from_pretrained(
+                model_path,
+                custom_pipeline="models.text_diffusion_pipeline.TextConditionalDDPMPipeline",
+                trust_remote_code=True,
+            )
+        else:
+            # Fallback: try unconditional
+            pipe = DiffusionPipeline.from_pretrained(
+                model_path,
+                custom_pipeline="models.latent_diffusion_pipeline.UnconditionalDDPMPipeline",
+                trust_remote_code=True,
+            )
+    return pipe

models/sentence_transformers_helper.py ADDED Viewed

	@@ -0,0 +1,114 @@

+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+#Mean Pooling - Take average of all tokens
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output.last_hidden_state
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+#Encode text
+def encode(texts, tokenizer, model, device='cpu'):
+    # Tokenize sentences
+    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+    encoded_input.to(device)
+    # Compute token embeddings
+    with torch.no_grad():
+        model_output = model(**encoded_input, return_dict=True)
+    # Perform pooling
+    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    # Normalize embeddings
+    embeddings = F.normalize(embeddings, p=2, dim=1)
+    embeddings = embeddings.to(device)
+    return embeddings
+# Get embeddings for a batch of captions and optional negative captions
+def get_embeddings(batch_size, tokenizer, model, captions=None, neg_captions=None, device='cpu'):
+    embeddings = encode([""]*batch_size, tokenizer, model, device)
+    if captions is not None:
+        caption_embeddings = encode(captions, tokenizer, model, device)
+        embeddings = torch.cat((embeddings, caption_embeddings), dim=0)
+    if neg_captions is not None:
+        neg_embeddings = encode(neg_captions, tokenizer, model, device)
+        embeddings = torch.cat((neg_embeddings, embeddings), dim=0)
+    embeddings = embeddings.unsqueeze(1)
+    return embeddings
+def get_embeddings_split(batch_size, tokenizer, model, captions=None, neg_captions=None, device='cpu', max_length=20):
+    padding_length = max(max([s.count(".") for s in captions]) if captions else 1,
+                     max([s.count(".") for s in neg_captions]) if neg_captions else 1)
+    if (padding_length>max_length):
+        raise ValueError(f"Token sequence length {padding_length} exceeds specified length {max_length}.")
+    empty_split = split_sentences([""] * batch_size, padding_length)
+    embeddings = get_embeddings_from_split(empty_split, tokenizer, model, device)
+    if(captions is not None):
+        captions_split = split_sentences(captions, padding_length)
+        caption_embeddings = get_embeddings_from_split(captions_split, tokenizer, model, device)
+        embeddings = torch.cat((embeddings, caption_embeddings), dim=0)
+    if(neg_captions is not None):
+        neg_split = split_sentences(neg_captions, padding_length)
+        neg_embeddings = get_embeddings_from_split(neg_split, tokenizer, model, device)
+        embeddings = torch.cat((neg_embeddings, embeddings), dim=0)
+    #We don't need to unsqueeze this, we have an array of (batch_size, padding_length, encoding_size) already
+    return embeddings.to(device)
+#This method takes a caption batch in list form, and outputs a 2d list where every caption has been split by period
+def split_sentences(caption_array, padding_length=20):
+    split_caption_array = []
+    #Padding happens here
+    for caption in caption_array:
+        split_caption = [s.strip() for s in caption.split(".") if s.strip()]
+        #This is the token padding, we just use an empty string
+        split_caption += [""] * (padding_length - len(split_caption))
+        split_caption_array.append(split_caption)
+    return split_caption_array
+#Expects all split vectors to be the same length
+def get_embeddings_from_split(caption_batch, tokenizer, model, device='cpu'):
+    all_caption_encodings = []
+    for caption_sequence in caption_batch:
+        #Encode the sequence of split captions as if it was a batch, should now be a [maxlength, embeddingsize] tensor
+        caption_sequence = encode(caption_sequence, tokenizer, model, device)
+        #We don't reshape this to avoid having to unsqueeze it later
+        all_caption_encodings.append(caption_sequence)
+    all_caption_encodings = torch.stack(all_caption_encodings, dim=0)
+    return all_caption_encodings
+if __name__ == "__main__":
+    cap = split_sentences(["Hello. My name is George. How. Are you doing. Today?", "I am doing. Just fine. Thanks."])
+    model_url = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
+    device = 'cuda'
+    tokenizer = AutoTokenizer.from_pretrained(model_url)
+    model = AutoModel.from_pretrained(model_url, trust_remote_code=True).to(device)
+    get_embeddings_from_split(cap, tokenizer, model, device)

models/text_diffusion_pipeline.py ADDED Viewed

	@@ -0,0 +1,442 @@

+import torch
+import torch.nn.functional as F
+from typing import NamedTuple, Optional
+import os
+from diffusers import DDPMPipeline, UNet2DConditionModel, DDPMScheduler
+import json
+# Running the main at the end of this requires messing with this import
+from models.text_model import TransformerModel
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+import util.common_settings as common_settings
+import models.sentence_transformers_helper as st_helper
+import models.text_model as text_model
+from models.general_training_helper import get_scene_from_embeddings
+class PipelineOutput(NamedTuple):
+    """Return type of TextConditionalDDPMPipeline.__call__."""
+    # Generated image tensor; batch dimension first per the __call__ docstring
+    images: torch.Tensor
+# Create a custom pipeline for text-conditional generation
+class TextConditionalDDPMPipeline(DDPMPipeline):
+    def __init__(self, unet, scheduler, text_encoder=None, tokenizer=None, supports_pretrained_split=False, block_embeddings=None):
+        super().__init__(unet=unet, scheduler=scheduler)
+        self.text_encoder = text_encoder
+        self.tokenizer = tokenizer
+        self.supports_negative_prompt = hasattr(unet, 'negative_prompt_support') and unet.negative_prompt_support
+        self.supports_pretrained_split = supports_pretrained_split
+        self.block_embeddings = block_embeddings
+        if self.tokenizer is None and self.text_encoder is not None:
+            # Use the tokenizer from the text encoder if not provided
+            self.tokenizer = self.text_encoder.tokenizer
+        # Register the text_encoder so that .to(), .cpu(), .cuda(), etc. work correctly
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+            text_encoder=self.text_encoder,
+            tokenizer=self.tokenizer,
+        )
+    # Override the to() method to ensure text_encoder is moved to the correct device
+    def to(self, device=None, dtype=None):
+        # Call the parent's to() method first
+        pipeline = super().to(device, dtype)
+        # Additionally move the text_encoder to the device
+        if self.text_encoder is not None:
+            self.text_encoder.to(device)
+        return pipeline
+    def save_pretrained(self, save_directory):
+        os.makedirs(save_directory, exist_ok=True)
+        super().save_pretrained(save_directory)  # saves UNet and scheduler
+        # Save block_embeddings tensor if it exists
+        if self.block_embeddings is not None:
+            torch.save(self.block_embeddings, os.path.join(save_directory, "block_embeddings.pt"))
+        # Save supports_negative_prompt and supports_pretrained_split flags
+        with open(os.path.join(save_directory, "pipeline_config.json"), "w") as f:
+            json.dump({
+                "supports_negative_prompt": self.supports_negative_prompt,
+                "supports_pretrained_split": self.supports_pretrained_split,
+                "text_encoder_type": type(self.text_encoder).__name__
+            }, f)
+        #Text encoder/tokenizer saving is different depending on if we're using a larger pretrained model
+        if isinstance(self.text_encoder, TransformerModel):
+            # Save custom text encoder
+            if self.text_encoder is not None:
+                self.text_encoder.save_pretrained(os.path.join(save_directory, "text_encoder"))
+        else:
+            #Save pretrained tokenizer by name, so we can load from huggingface instead of saving a giant local model
+            text_encoder_info = {
+                "text_encoder_name": self.text_encoder.config.name_or_path,
+                "tokenizer_name": self.tokenizer.name_or_path,
+            }
+            text_encoder_directory = os.path.join(save_directory, "text_encoder")
+            os.makedirs(text_encoder_directory, exist_ok=True)
+            with open(os.path.join(text_encoder_directory, "loading_info.json"), "w") as f:
+                json.dump(text_encoder_info, f)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_path, **kwargs):
+        #from diffusers.utils import load_config, load_state_dict
+        # Load model_index.json
+        #model_index = load_config(pretrained_model_path)
+        # Load components manually
+        unet_path = os.path.join(pretrained_model_path, "unet")
+        unet = UNet2DConditionModel.from_pretrained(unet_path)
+        scheduler_path = os.path.join(pretrained_model_path, "scheduler")
+        # Have heard that DDIMScheduler might be faster for inference, though not necessarily better
+        scheduler = DDPMScheduler.from_pretrained(scheduler_path)
+        tokenizer = None
+        text_encoder_path = os.path.join(pretrained_model_path, "text_encoder")
+        if os.path.exists(text_encoder_path):
+            #Test for the new saving system, where we save a simple config file
+            if os.path.exists(os.path.join(text_encoder_path, "loading_info.json")):
+                with open(os.path.join(text_encoder_path, "loading_info.json"), "r") as f:
+                    encoder_config = json.load(f)
+                text_encoder = AutoModel.from_pretrained(encoder_config['text_encoder_name'], trust_remote_code=True)
+                tokenizer = AutoTokenizer.from_pretrained(encoder_config['tokenizer_name'])
+            #Legacy loading system, loads models directly if the whole thing is saved in the directory
+            else:
+                try:
+                    text_encoder = AutoModel.from_pretrained(text_encoder_path, local_files_only=True, trust_remote_code=True)
+                    tokenizer = AutoTokenizer.from_pretrained(text_encoder_path, local_files_only=True)
+                except (ValueError, KeyError):
+                    text_encoder = TransformerModel.from_pretrained(text_encoder_path)
+                    tokenizer = text_encoder.tokenizer
+        else:
+            text_encoder = None
+        # Instantiate your pipeline
+        pipeline = cls(
+            unet=unet,
+            scheduler=scheduler,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            **kwargs,
+        )
+        #Loads block embeddings if present
+        block_embeds_path = os.path.join(pretrained_model_path, "block_embeddings.pt")
+        if os.path.exists(block_embeds_path):
+            pipeline.block_embeddings = torch.load(block_embeds_path, map_location="cpu")
+        else:
+            pipeline.block_embeddings = None
+        # Load supports_negative_prompt flag if present
+        config_path = os.path.join(pretrained_model_path, "pipeline_config.json")
+        if os.path.exists(config_path):
+            with open(config_path, "r") as f:
+                config = json.load(f)
+            pipeline.supports_negative_prompt = config.get("supports_negative_prompt", False)
+            pipeline.supports_pretrained_split = config.get("supports_pretrained_split", False)
+        return pipeline
+    # --- Handle batching for captions ---
+    def _prepare_text_batch(self, text: Optional[str | list[str]], batch_size: int, name: str) -> Optional[list[str]]:
+        if text is None:
+            return None
+        if isinstance(text, str):
+            return [text] * batch_size
+        if isinstance(text, list):
+            if len(text) == 1:
+                return text * batch_size
+            if len(text) != batch_size:
+                raise ValueError(f"{name} list length {len(text)} does not match batch_size {batch_size}")
+            return text
+        raise ValueError(f"{name} must be a string or list of strings")
+    def _prepare_initial_sample(self,
+                                raw_latent_sample: Optional[torch.Tensor],
+                                input_scene: Optional[torch.Tensor],
+                                batch_size: int, height: int, width: int,
+                                generator: Optional[torch.Generator]) -> torch.Tensor:
+        """Prepare the initial sample for diffusion."""
+        sample_shape = (batch_size, self.unet.config.in_channels, height, width)
+        if raw_latent_sample is not None:
+            if input_scene is not None:
+                raise ValueError("Cannot provide both raw_latent_sample and input_scene")
+            sample = raw_latent_sample.to(self.device)
+            if sample.shape[1] != sample_shape[1]:
+                raise ValueError(f"Wrong number of channels in raw_latent_sample: Expected {self.unet.config.in_channels} but got {sample.shape[1]}")
+            if sample.shape[0] == 1 and batch_size > 1:
+                sample = sample.repeat(batch_size, 1, 1, 1)
+            elif sample.shape[0] != batch_size:
+                raise ValueError(f"raw_latent_sample batch size {sample.shape[0]} does not match batch_size {batch_size}")
+        elif input_scene is not None:
+            # input_scene can be (H, W) or (batch_size, H, W)
+            scene_tensor = torch.tensor(input_scene, dtype=torch.long, device=self.device)
+            if scene_tensor.dim() == 2:
+                # (H, W) -> repeat for batch
+                scene_tensor = scene_tensor.unsqueeze(0).repeat(batch_size, 1, 1)
+            elif scene_tensor.shape[0] == 1 and batch_size > 1:
+                scene_tensor = scene_tensor.repeat(batch_size, 1, 1)
+            elif scene_tensor.shape[0] != batch_size:
+                raise ValueError(f"input_scene batch size {scene_tensor.shape[0]} does not match batch_size {batch_size}")
+            # One-hot encode: (batch, H, W, C)
+            one_hot = F.one_hot(scene_tensor, num_classes=self.unet.config.in_channels).float()
+            # (batch, H, W, C) -> (batch, C, H, W)
+            sample = one_hot.permute(0, 3, 1, 2)
+        else:
+            # Start from random noise
+            sample = torch.randn(sample_shape, generator=generator, device=self.device)
+        return sample
+    def __call__(
+        self,
+        caption: Optional[str | list[str]] = None,
+        negative_prompt: Optional[str | list[str]] = None,
+        generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = common_settings.NUM_INFERENCE_STEPS,
+        guidance_scale: float = common_settings.GUIDANCE_SCALE,
+        height: int = common_settings.MARIO_HEIGHT,
+        width: int = common_settings.MARIO_WIDTH,
+        raw_latent_sample: Optional[torch.FloatTensor] = None,
+        input_scene: Optional[torch.Tensor] = None,
+        output_type: str = "tensor",
+        batch_size: int = 1,
+        show_progress_bar: bool = True,
+    ) -> PipelineOutput:
+        """Generate a batch of images based on text input using the diffusion model.
+        Args:
+            caption: Text description(s) of the desired output. Can be a string or list of strings.
+            negative_prompt: Text description(s) of what should not appear in the output. String or list.
+            generator: Random number generator for reproducibility.
+            num_inference_steps: Number of denoising steps (more = higher quality, slower).
+            guidance_scale: How strongly the generation follows the text prompt (higher = stronger).
+            height: Height of generated image in tiles.
+            width: Width of generated image in tiles.
+            raw_latent_sample: Optional starting point for diffusion instead of random noise.
+                Must have correct number of channels matching the UNet.
+            input_scene: Optional 2D or 3D int tensor where each value corresponds to a tile type.
+                Will be converted to one-hot encoding as starting point.
+            output_type: Currently only "tensor" is supported.
+            batch_size: Number of samples to generate in parallel.
+        Returns:
+            PipelineOutput containing the generated image tensor (batch_size, ...).
+        """
+        #       I would like to simplify the code to this, but the AI suggestion didn't work, and
+        #       I did not feel good just pasting it all in. Will need to tackle it bit by bit.
+        #        if caption is not None and self.text_encoder is None:
+        #            raise ValueError("Text encoder required for conditional generation")
+        #        self.unet.eval()
+        #        if self.text_encoder is not None:
+        #            self.text_encoder.to(self.device)
+        #            self.text_encoder.eval()
+        #
+        #        with torch.no_grad():
+        #            # Process text inputs
+        #            captions = self.prepare_text_batch(caption, batch_size, "caption")
+        #            negatives = self.prepare_text_batch(negative_prompt, batch_size, "negative_prompt")
+        #            # Get embeddings
+        #            text_embeddings = self.prepare_embeddings(captions, negatives, batch_size)
+        #
+        #            # Set up initial latent state
+        #            sample = self.prepare_initial_sample(raw_latent_sample, input_scene,
+        #                                              batch_size, height, width, generator)
+        #            # Run diffusion process
+        #            sample = self.run_diffusion(sample, text_embeddings, num_inference_steps,
+        #                                      guidance_scale, generator, show_progress_bar,
+        #                                      has_caption=caption is not None,
+        #                                      has_negative=negative_prompt is not None)
+        #            # Format output
+        #            if output_type == "tensor":
+        #                sample = F.softmax(sample, dim=1)
+        #            else:
+        #                raise ValueError(f"Unsupported output type: {output_type}")
+        #        return PipelineOutput(images=sample)
+        # Validate text encoder if we need it
+        if caption is not None and self.text_encoder is None:
+            raise ValueError("Text encoder is required for conditional generation")
+        self.unet.eval()
+        if self.text_encoder is not None:
+            self.text_encoder.to(self.device)
+            self.text_encoder.eval()
+        with torch.no_grad():
+            captions = self._prepare_text_batch(caption, batch_size, "caption")
+            negatives = self._prepare_text_batch(negative_prompt, batch_size, "negative_prompt")
+            # --- Prepare text embeddings ---
+            if(isinstance(self.text_encoder, TransformerModel)):
+                text_embeddings = text_model.get_embeddings(batch_size=batch_size,
+                                                            tokenizer=self.text_encoder.tokenizer,
+                                                            text_encoder=self.text_encoder,
+                                                            captions=captions,
+                                                            neg_captions=negatives,
+                                                            device=self.device)
+            else: #Case for the pre-trained text encoder
+                if(self.supports_pretrained_split): #If we have a split flag incorporated
+                    text_embeddings = st_helper.get_embeddings_split(batch_size = batch_size,
+                                                            tokenizer=self.tokenizer,
+                                                            model=self.text_encoder,
+                                                            captions=captions,
+                                                            neg_captions=negatives,
+                                                            device=self.device)
+                else:
+                    text_embeddings = st_helper.get_embeddings(batch_size = batch_size,
+                                                                tokenizer=self.tokenizer,
+                                                                model=self.text_encoder,
+                                                                captions=captions,
+                                                                neg_captions=negatives,
+                                                                device=self.device)
+            # --- Set up initial latent state ---
+            sample = self._prepare_initial_sample(raw_latent_sample, input_scene,
+                                                 batch_size, height, width, generator)
+            # --- Set up diffusion process ---
+            self.scheduler.set_timesteps(num_inference_steps)
+            # Denoising loop
+            iterator = self.progress_bar(self.scheduler.timesteps) if show_progress_bar else self.scheduler.timesteps
+            for t in iterator:
+                # Handle conditional generation
+                if captions is not None:
+                    if negatives is not None:
+                        # Three copies for negative prompt guidance
+                        model_input = torch.cat([sample, sample, sample], dim=0)
+                    else:
+                        # Two copies for standard classifier-free guidance
+                        model_input = torch.cat([sample, sample], dim=0)
+                else:
+                    model_input = sample
+                # Predict noise residual
+                model_kwargs = {"encoder_hidden_states": text_embeddings}
+                noise_pred = self.unet(model_input, t, **model_kwargs).sample
+                # Apply guidance
+                if captions is not None:
+                    if negatives is not None:
+                        # Split predictions for negative, unconditional, and text-conditional
+                        noise_pred_neg, noise_pred_uncond, noise_pred_text = noise_pred.chunk(3)
+                        noise_pred_guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                        noise_pred = noise_pred_guided - guidance_scale * (noise_pred_neg - noise_pred_uncond)
+                    else:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # Compute previous sample: x_{t-1} = scheduler(x_t, noise_pred)
+                sample = self.scheduler.step(noise_pred, t, sample, generator=generator).prev_sample
+            # Convert to output format
+            if output_type == "tensor":
+                if self.block_embeddings is not None:
+                    sample = get_scene_from_embeddings(sample, self.block_embeddings)
+                else:
+                    # Apply softmax to get probabilities for each tile type
+                    sample = F.softmax(sample, dim=1)
+                    sample = sample.detach().cpu()
+            else:
+                raise ValueError(f"Unsupported output type: {output_type}")
+        return PipelineOutput(images=sample)
+    def print_unet_architecture(self):
+        """Prints the architecture of the UNet model."""
+        print(self.unet)
+    def print_text_encoder_architecture(self):
+        """Prints the architecture of the text encoder model, if it exists."""
+        if self.text_encoder is not None:
+            print(self.text_encoder)
+        else:
+            print("No text encoder is set.")
+    def save_unet_architecture_pdf(self, height, width, filename="unet_architecture", batch_size=1, device=None):
+        """
+        Have to separately install torchview for this to work
+        Saves a visualization of the UNet architecture as a PDF using torchview.
+        Args:
+            height: Height of the dummy input.
+            width: Width of the dummy input.
+            filename: Output PDF filename.
+            batch_size: Batch size for dummy input.
+            device: Device to run the dummy input on (defaults to pipeline device).
+        """
+        from torchview import draw_graph
+        import graphviz
+        if device is None:
+            device = self.device if hasattr(self, 'device') else 'cpu'
+        in_channels = self.unet.config.in_channels if hasattr(self.unet, 'config') else 1
+        sample_shape = tuple([batch_size, in_channels, height, width])
+        dummy_x = torch.randn(size=sample_shape, device=device)
+        dummy_t = torch.tensor([0] * batch_size, dtype=torch.long, device=device)
+        # Prepare dummy text embedding (match what your UNet expects)
+        if hasattr(self.unet, 'config') and hasattr(self.unet.config, 'cross_attention_dim'):
+            cross_attention_dim = self.unet.config.cross_attention_dim
+        else:
+            cross_attention_dim = 128  # fallback
+        encoder_hidden_states = torch.randn(batch_size, 1, cross_attention_dim, device=device)
+        self.unet.eval()
+        inputs = (dummy_x, dummy_t, encoder_hidden_states)
+        #self.unet.down_blocks = self.unet.down_blocks[:2]
+        graph = draw_graph(
+            model=self.unet,
+            input_data=inputs,
+            expand_nested=False,
+            #enable_output_shape=True,
+            #roll_out="nested",
+            depth=1
+        )
+        #graph.visual_graph.engine = "neato"
+        graph.visual_graph.attr(#rankdir="LR",
+                                nodesep="0.1",      # decrease space between nodes in the same rank (default ~0.25)
+                                ranksep="0.2",       # decrease space between ranks (default ~0.5)
+                                concentrate="true"  # merge edges between nodes in the same rank
+                            )
+        graph.visual_graph.node_attr.update(
+            shape="rectangle",
+            width="1.5",   # narrow width
+            height="0.5"  # taller height to make vertical rectangles
+            #fixedsize="true"
+        )
+        graph.visual_graph.render(filename, format='pdf', cleanup=False)  # Cleanup removes intermediate files
+        graph.visual_graph.save('unet_architecture.dot')
+        # Save the graph to a PDF file
+        print(f"UNet architecture saved to {filename}")

models/text_model.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import argparse
+from xml.parsers.expat import model
+import torch
+import torch.nn as nn
+import math
+import os
+import json
+from safetensors.torch import save_file, load_file
+from tokenizer import Tokenizer
+def get_embeddings(batch_size, tokenizer, text_encoder, captions=None, neg_captions=None, device='cpu'):
+    max_length = text_encoder.max_seq_length
+    empty_ids = encode_token_captions([""] * batch_size, tokenizer, max_length, device=device)
+    embeddings = text_encoder.get_embeddings(empty_ids)
+    if(captions is not None):
+        caption_ids = encode_token_captions(captions, tokenizer, max_length, device=device)
+        caption_embeddings = text_encoder.get_embeddings(caption_ids)
+        embeddings = torch.cat((embeddings, caption_embeddings), dim=0)
+    if(neg_captions is not None):
+        neg_ids = encode_token_captions(neg_captions, tokenizer, max_length, device=device)
+        neg_embeddings = text_encoder.get_embeddings(neg_ids)
+        embeddings = torch.cat((neg_embeddings, embeddings), dim=0)
+    return embeddings.to(device)
+def encode_token_captions(captions, tokenizer, max_length, device='cpu'):
+    caption_ids = []
+    for caption in captions:
+        tokens = tokenizer.encode(caption)
+        caption_tokens = tokenizer.pad_sequence(tokens, max_length)
+        caption_ids.append(torch.tensor(caption_tokens, dtype=torch.long).unsqueeze(0))
+    return torch.cat(caption_ids, dim=0).to(device)
+# Transformer model for MLM training
+class TransformerModel(nn.Module):
+    def __init__(self, vocab_size, embedding_dim, hidden_dim, tokenizer=None, num_heads=8, num_layers=4, max_seq_length=100):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.vocab_size = vocab_size
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.max_seq_length = max_seq_length
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.positional_encoding = self.create_positional_encoding(max_seq_length, embedding_dim)
+        encoder_layers = nn.TransformerEncoderLayer(
+            d_model=embedding_dim,
+            nhead=num_heads,
+            dim_feedforward=hidden_dim,
+            batch_first=True
+        )
+        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
+        self.fc = nn.Linear(embedding_dim, vocab_size)
+        self.tokenizer = tokenizer
+    def create_positional_encoding(self, max_seq_length, embedding_dim):
+        # The implementation uses a sinusoidal positional encoding, which creates a unique pattern for each position in the sequence.
+        # The frequencies create unique values, the sin/cos bounds values
+        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+        # Creates a set of divisors that create different frequencies
+        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
+        pe = torch.zeros(max_seq_length, embedding_dim)
+        # Even dimensions use sin, odd dimensions use cos
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        return pe.unsqueeze(0)
+    def get_embeddings(self, x):
+        """ This gets the actual latent embedding vectors """
+        # Ensure positional encoding is on the same device as input
+        pe = self.positional_encoding[:, :x.size(1), :].to(x.device)
+        # Embed input and add positional encoding
+        embedded = self.embedding(x) + pe
+        return self.transformer(embedded)
+    def forward(self, x):
+        """ This gets the token within the vocabulary """
+        transformer_out = self.get_embeddings(x)
+        # Project to vocabulary size
+        return self.fc(transformer_out)
+    def save_pretrained(self, save_directory):
+        os.makedirs(save_directory, exist_ok=True)
+        config = {
+            "vocab_size": self.vocab_size,
+            "embedding_dim": self.embedding_dim,
+            "hidden_dim": self.hidden_dim,
+            "num_heads": self.num_heads,
+            "num_layers": self.num_layers,
+            "max_seq_length": self.max_seq_length,
+        }
+        with open(os.path.join(save_directory, "config.json"), "w") as f:
+            json.dump(config, f)
+        # Save model weights
+        save_file(self.state_dict(), os.path.join(save_directory, "model.safetensors"))
+        # Save tokenizer if present
+        if self.tokenizer is not None:
+            self.tokenizer.save(os.path.join(save_directory, "tokenizer.pkl"))
+    @classmethod
+    def from_pretrained(cls, load_directory):
+        with open(os.path.join(load_directory, "config.json")) as f:
+            config = json.load(f)
+        model = cls(**config)
+        # Load weights
+        state_dict = load_file(os.path.join(load_directory, "model.safetensors"))
+        model.load_state_dict(state_dict)
+        # Load tokenizer if available
+        tokenizer_path = os.path.join(load_directory, "tokenizer.pkl")
+        if os.path.exists(tokenizer_path):
+            tokenizer = Tokenizer()
+            tokenizer.load(tokenizer_path)
+            model.tokenizer = tokenizer
+        return model
+    def print_architecture(self, inputs=None):
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--model_path", type=str, required=True, help="Path to trained transformer model")
+        parser.add_argument("--json", type=str, default="SMB1_LevelsAndCaptions-regular-test.json", help="Path to dataset json file")
+        parser.add_argument("--num_samples", type=int, default=10, help="Number of captions to evaluate")
+        parser.add_argument("--mask_prob", type=float, default=0.15, help="Probability of masking each token")
+        parser.add_argument("--compare_checkpoints", action="store_true", default=False, help="Run comparison across all model checkpoints")
+        args = parser.parse_args()
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = TransformerModel.from_pretrained(args.model_path).to(device)
+        print(f"Loaded model from {args.model_path}")
+        import os
+        import re
+        import json
+        import matplotlib.pyplot as plt
+        from torchview import draw_graph
+        import graphviz
+        graph = draw_graph(
+            model=model,
+            input_data=inputs,
+            expand_nested=False,
+            #enable_output_shape=True,
+            #roll_out="nested",
+            depth=1
+        )
+        # Save plot
+        filename = 'mlm_architecture'
+        graph.visual_graph.render(filename, format='pdf', cleanup=False)  # Cleanup removes intermediate files
+        #graph.visual_graph.save('unet_architecture.dot')
+    def save_architecture_pdf(self, filename="transformer_architecture.pdf", input_length=32):
+        """Save a visualization of the model architecture as a PDF using torchview."""
+        try:
+            from torchview import draw_graph
+        except ImportError:
+            raise ImportError("torchview is required for model visualization. Install with 'pip install torchview'.")
+        import torch
+        import os
+        # Create a dummy input of the correct type for the model
+        captions = ["full floor. two coins. one pipe.", "floor with two gaps. one cannon. many enemies."]
+        tensor = encode_token_captions(captions, self.tokenizer, self.max_seq_length, device=next(self.parameters()).device)
+        input_length = tensor.size(1) if tensor.dim() > 1 else self.max_seq_length
+        num_tokens_list = [len(self.tokenizer.encode(c)) for c in captions]
+        input_length = max(num_tokens_list) if num_tokens_list else input_length
+        dummy_input = torch.zeros((1, input_length), dtype=torch.long, device=next(self.parameters()).device)
+        # Draw the graph and save as PNG
+        graph = draw_graph(self, input_data=dummy_input, expand_nested=True, save_graph=True, filename=filename.replace('.pdf',''), directory=".", depth=2)
+        png_file = filename.replace('.pdf', '.png')
+        # Convert PNG to PDF
+        if os.path.exists(png_file):
+            try:
+                from PIL import Image
+                im = Image.open(png_file)
+                im.save(filename, "PDF", resolution=100.0)
+                print(f"Saved architecture PDF to {filename}")
+                # Optionally, remove the PNG file
+                os.remove(png_file)
+            except ImportError:
+                print(f"PIL not installed. Architecture saved as PNG: {png_file}")
+            except Exception as e:
+                print(f"Could not convert PNG to PDF: {e}")
+        else:
+            print(f"Could not find PNG file to convert: {png_file}")

util/common_settings.py ADDED Viewed

	@@ -0,0 +1,18 @@

+# Shared default settings.
+# Diffusion sampling defaults (number of inference steps and guidance scale)
+NUM_INFERENCE_STEPS = 30
+GUIDANCE_SCALE = 7.5
+# Mario settings -- presumably scene size in tiles, pixels per tile side,
+# and number of distinct tile types (TODO confirm against the callers)
+MARIO_HEIGHT = 16
+MARIO_WIDTH = 16
+MARIO_TILE_PIXEL_DIM = 16
+MARIO_TILE_COUNT = 13
+# LR settings -- presumably Lode Runner, same four quantities as above (TODO confirm)
+LR_HEIGHT = 32
+LR_WIDTH = 32
+LR_TILE_PIXEL_DIM = 8
+LR_TILE_COUNT = 8
+# Mega Man settings -- only height and width are visible here
+MEGAMAN_HEIGHT = 14
+MEGAMAN_WIDTH = 16

util/naming_conventions.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Ordered (source name, display name) pairs. A list rather than a dict
+# because get_model_name_map_and_order() also derives an ordering from it.
+# NOTE(review): the first element looks like a model/results directory name
+# and the second like a label for plots or tables -- confirm against callers.
+model_name_map = [
+    ("Mar1and2-conditional-regular", "MLM-regular"),
+    ("Mar1and2-conditional-absence", "MLM-absence"),
+    ("Mar1and2-conditional-negative", "MLM-negative"),
+    ("Mar1and2-conditional-MiniLM-regular", "MiniLM-single-regular"),
+    ("Mar1and2-conditional-MiniLM-absence", "MiniLM-single-absence"),
+    ("Mar1and2-conditional-MiniLM-negative", "MiniLM-single-negative"),
+    ("Mar1and2-conditional-MiniLMsplit-regular", "MiniLM-multiple-regular"),
+    ("Mar1and2-conditional-MiniLMsplit-absence", "MiniLM-multiple-absence"),
+    ("Mar1and2-conditional-MiniLMsplit-negative", "MiniLM-multiple-negative"),
+    ("Mar1and2-conditional-GTE-regular", "GTE-single-regular"),
+    ("Mar1and2-conditional-GTE-absence", "GTE-single-absence"),
+    ("Mar1and2-conditional-GTE-negative", "GTE-single-negative"),
+    ("Mar1and2-conditional-GTEsplit-regular", "GTE-multiple-regular"),
+    ("Mar1and2-conditional-GTEsplit-absence", "GTE-multiple-absence"),
+    ("Mar1and2-conditional-GTEsplit-negative", "GTE-multiple-negative"),
+    ("Mar1and2-fdm-MiniLM-regular", "FDM-MiniLM-regular"),
+    ("Mar1and2-fdm-MiniLM-absence", "FDM-MiniLM-absence"),
+    ("Mar1and2-fdm-GTE-regular", "FDM-GTE-regular"),
+    ("Mar1and2-fdm-GTE-absence", "FDM-GTE-absence"),
+    ("Mar1and2-wgan", "WGAN"),
+    ("Mar1and2-unconditional", "Unconditional"),
+    ("MarioGPT_metrics", "MarioGPT"),
+]
+def get_model_name_map_and_order():
+    mapping = dict(model_name_map)
+    order = [v for k, v in model_name_map]
+    return mapping, order

util/plotter.py ADDED Viewed

	@@ -0,0 +1,173 @@

+# Track changes in loss and learning rate during execution
+import argparse
+import matplotlib
+import matplotlib.pyplot as plt
+import os
+import time
+import json
+import tempfile
+import shutil
+from pathlib import Path
+def parse_args():
+    parser = argparse.ArgumentParser(description="Train a text-conditional diffusion model for tile-based level generation")
+    # Dataset args
+    parser.add_argument("--log_file", type=str, default=None, help="The the filepath of the file to get the data from")
+    parser.add_argument("--left_key", type=str, default=None, help="The key for the left y-axis")
+    parser.add_argument("--right_key", type=str, default=None, help="The key for the right y-axis")
+    parser.add_argument("--left_label", type=str, default=None, help="The label for the left y-axis")
+    parser.add_argument("--right_label", type=str, default=None, help="The label for the right y-axis")
+    parser.add_argument("--output_png", type=str, default="output.png", help="The output png file")
+    parser.add_argument("--update_interval", type=int, default=1.0, help="The update inteval in epochs")
+    parser.add_argument("--start_point", type=int, default=None, help="The start point for the plot")
+    return parser.parse_args()
+def main():
+    args = parse_args()
+    log_file = args.log_file
+    left_key = args.left_key
+    right_key = args.right_key
+    left_label = args.left_label
+    right_label = args.right_label
+    output_png = args.output_png
+    update_interval = args.update_interval
+    start_point = args.start_point
+    general_update_plot(log_file, left_key, right_key, left_label, right_label, output_png, update_interval=update_interval, startPoint=start_point)
+def general_update_plot(log_file, left_key, right_key, left_label, right_label, output_png, update_interval=1.0, startPoint=None):
+    log_dir = os.path.dirname(log_file)
+    # Create figure here and ensure it's closed
+    fig = plt.figure(figsize=(10, 6))
+    ax = fig.add_subplot(111)
+    try:
+        if os.path.exists(log_file):
+            with open(log_file, 'r') as f:
+                data = [json.loads(line) for line in f if line.strip()]
+            if not data:
+                return
+            if startPoint is not None:
+                data = [entry for entry in data if entry.get('epoch', 0) >= startPoint]
+            if not data:
+                return
+            epochs = [entry.get('epoch', 0) for entry in data]
+            left = [entry.get(left_key, 0) for entry in data]
+            # For right axis (e.g., lr), only include points where right_key exists
+            right_points = [(entry.get('epoch', 0), entry.get(right_key))
+                            for entry in data if right_key in entry]
+            if right_points:
+                right_epochs, right_values = zip(*right_points)
+            else:
+                right_epochs, right_values = [], []
+            # Clear axis
+            ax.clear()
+            # Plot both metrics on the same axis
+            ax.plot(epochs, left, 'b-', label=left_label)
+            if right_epochs:
+                ax.plot(right_epochs, right_values, 'r-', label=right_label)
+            ax.set_xlabel('Epoch')
+            ax.set_ylabel(left_label) # "Loss" as y-axis label
+            ax.set_title('Training Progress')
+            ax.legend(loc='upper left')
+            #Limit x-axis to startPoint if provided
+            if startPoint is not None:
+                ax.set_xlim(left=startPoint)
+            fig.tight_layout()
+            # Use the stored base directory instead of getting it from log_file
+            if os.path.isabs(output_png) or os.path.dirname(output_png):
+                output_path = output_png
+            else:
+                output_path = os.path.join(log_dir, output_png)
+            save_figure_safely(fig, output_path)
+    finally:
+        plt.close(fig)  # Ensure figure is closed even if an error occurs
+def save_figure_safely(fig, output_path):
+    """Save figure to a temporary file first, then move it to the final location"""
+    output_path = str(Path(output_path))  # Convert to string path
+    # Create temporary file with .png extension
+    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
+        tmp_path = tmp_file.name
+    try:
+        # Save to temporary file
+        fig.savefig(tmp_path)
+        # Create output directory if it doesn't exist
+        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+        # Try to move the file to final destination
+        # If move fails, try to copy and then delete
+        try:
+            shutil.move(tmp_path, output_path)
+        except OSError:
+            shutil.copy2(tmp_path, output_path)
+            os.unlink(tmp_path)
+    except Exception as e:
+        # Clean up temporary file if anything goes wrong
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+        raise e
+class Plotter:
+    def __init__(self, log_file, update_interval=1.0, left_key='loss', right_key='lr',
+                 left_label='Loss', right_label='Learning Rate', output_png='training_progress.png'):
+        self.log_dir = os.path.dirname(log_file)
+        self.log_file = log_file
+        self.update_interval = update_interval
+        self.running = True
+        self.output_png = output_png
+        self.left_key = left_key
+        self.right_key = right_key
+        self.left_label = left_label
+        self.right_label = right_label
+        matplotlib.use('Agg')
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop_plotting()
+    def __del__(self):
+        self.stop_plotting()
+    def update_plot(self):
+        general_update_plot(self.log_file, self.left_key, self.right_key,
+                          self.left_label, self.right_label, self.output_png,
+                          update_interval=self.update_interval)
+    def start_plotting(self):
+        print("Starting plotting in background")
+        while self.running:
+            self.update_plot()
+            time.sleep(self.update_interval)
+    def stop_plotting(self):
+        if hasattr(self, 'running'):  # Check if already stopped
+            self.running = False
+            self.update_plot()
+            print("Plotting stopped")
+# Run the command-line plotter only when this module is executed as a script
+if __name__ == "__main__":
+    main()

util/sampler.py ADDED Viewed

	@@ -0,0 +1,473 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import os
+import subprocess
+import tempfile
+import numpy as np
+import torch
+from PIL.Image import Image
+from tqdm import tqdm
+from transformers import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper
+from mario_gpt.lm.base import BaseMarioLM
+from mario_gpt.prompter import Prompter
+from mario_gpt.simulator import Simulator
+from mario_gpt.utils import (
+    convert_level_to_png,
+    load_level,
+    save_level,
+    trim_level,
+    view_level,
+)
+def scene_to_ascii(scene, id_to_char, shorten: bool = True) -> List[str]:
+    """
+    Convert JSON scene files from a list of lists of ints
+    to a list of ASCII strings using id_to_char mapping.
+    If shorten is True, only the last 15 rows are kept.
+    Args:
+        scene: List[List[int]] - 2D array of tile IDs
+        id_to_char: Dict[int, str] - mapping from tile ID to ASCII character
+        shorten: bool - If True, will shorten the output to only include the first 15 rows
+                        so A* Mario (for SNES graphics) to run without glitching
+    Returns:
+        List[str]: List of strings, each representing a row in ASCII
+    """
+    if shorten and len(scene) > 15:
+        scene = scene[-15:]  # Keep only the last 15 rows
+    return ["".join(id_to_char[num] for num in row) for row in scene]
+@dataclass
+class SampleOutput:
+    level: Optional[List[str]]
+    prompt: Optional[str] = None
+    img: Optional[Image] = None
+    sample_predictions_str: Optional[List[str]] = None
+    sample_predictions_img: Optional[Image] = None
+    level_tensor: Optional[torch.Tensor] = None
+    sample_predictions_tensor: Optional[torch.Tensor] = None
+    # Uses MarioEval graphics for rendering levels when True
+    use_snes_graphics: bool = False
+    @classmethod
+    def create(
+        cls,
+        level_tensor: torch.Tensor,
+        sample_predictions_tensor: torch.Tensor,
+        tokenizer,
+        prompter: Optional[Prompter] = None,
+    ) -> SampleOutput:
+        # batch = 1
+        level = None
+        img = None
+        try:
+            level = view_level(level_tensor, tokenizer)
+            img = convert_level_to_png(level)[0]
+        except Exception as e:
+            print(
+                f"Failed to generate string or image representation for full level! Got error {e}"
+            )
+            level = None
+            img = None
+        try:
+            sample_predictions_str = view_level(sample_predictions_tensor, tokenizer)
+            sample_predictions_img = convert_level_to_png(sample_predictions_str)[0]
+        except Exception as e:
+            print(
+                f"Failed to generate string or image representation for sampled predictions! Got error {e}"
+            )
+            sample_predictions_str = None
+            sample_predictions_img = None
+        prompt = None
+        if prompter is not None:
+            prompt = prompter(level_tensor)[0]
+        return SampleOutput(
+            level,
+            prompt,
+            img,
+            sample_predictions_str,
+            sample_predictions_img,
+            level_tensor,
+            sample_predictions_tensor,
+        )
+    @classmethod
+    def from_level_predictions(
+        cls,
+        level: torch.Tensor,
+        sample_predictions: torch.Tensor,
+        tokenizer,
+        prompter: Optional[Prompter] = None,
+    ) -> Union[SampleOutput, List[SampleOutput]]:
+        level_tensor = trim_level(level).squeeze().detach().cpu()
+        sample_predictions_tensor = (
+            trim_level(sample_predictions).squeeze().detach().cpu()
+        )
+        if len(level_tensor.shape) == 1:
+            return SampleOutput.create(
+                level_tensor, sample_predictions_tensor, tokenizer, prompter
+            )
+        out = []
+        for _level_tensor, _sample_predictions_tensor in zip(
+            level_tensor, sample_predictions_tensor
+        ):
+            sample_output = SampleOutput.create(
+                _level_tensor, _sample_predictions_tensor, tokenizer, prompter
+            )
+            out.append(sample_output)
+        return out
+    def save(self, filename: str) -> str:
+        save_level(self.level, filename)
+    @classmethod
+    def load(cls, filename: str) -> SampleOutput:
+        level = load_level(filename)
+        return SampleOutput(level=level)
+    def play(self, game="mario", level_idx=None, dataset_path=None):
+        """
+        Play the level using the specified game engine.
+        game: "mario" (default) or "loderunner"
+        """
+        if game == "loderunner":
+            import tempfile, json
+            # Convert self.level (list of strings) to Lode Runner JSON format
+            scene = [[c for c in row] for row in self.level]
+            lr_json = [{
+                "scene": scene,
+                "caption": ""
+            }]
+            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
+                json.dump(lr_json, tmp)
+                tmp_path = tmp.name
+            import sys, os
+            #sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+            from LodeRunner.loderunner import main
+            tmp_path = tmp_path if dataset_path is None else dataset_path
+            print(f"Playing Lode Runner level interactively -- {tmp_path}!")
+            main.play_lr_level(tmp_path, level_index=level_idx if level_idx is not None else 1)
+        else:
+            if self.use_snes_graphics:
+                simulator = CustomSimulator(level=self.level, jar_path="MarioEval.jar")
+            else:
+                simulator = CustomSimulator(level=self.level, jar_path="NESMarioEval.jar")
+            simulator.interactive()
+    def run_astar(self, render=True):
+        if self.use_snes_graphics:
+            simulator = CustomSimulator(level=self.level, jar_path="MarioEval.jar")
+        else:
+            simulator = CustomSimulator(level=self.level, jar_path="NESMarioEval.jar")
+        return simulator.astar(render)
+class CustomSimulator:
+    """
+        The classic Mario simulator used by MarioGPT is generally,
+        better, but it doesn't return any information about
+        Mario's performance. The main point of this simulator
+        is that information about the performance of the agent
+        is printed to the console (though I still need a way
+        to caption and return that information)
+    """
+    def __init__(self, level, jar_path="MarioEval.jar"):
+        while len(level) > 15:
+            level.pop(0)
+        # For some reason, my older A* agent
+        # crashes on Mario levels with 16 rows or more
+        self.level = level
+        self.jar_path = jar_path
+    def interactive(self):
+        t = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
+        save_level(self.level, t.name)
+        print(f"Playing level interactively -- {t.name}!")
+        _ = subprocess.run(
+            ["java", "-jar", self.jar_path, "human", t.name, "human"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        t.close()
+        os.unlink(t.name)
+    def astar(self, render: bool = True):
+        t = tempfile.NamedTemporaryFile(suffix=".txt", delete=False)
+        save_level(self.level, t.name)
+        print(f"Running Astar agent on level! -- {t.name}")
+        render_str = "human" if render else "norender"
+        result = subprocess.run(
+            ["java", "-jar", self.jar_path, "astar", t.name, render_str],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        t.close()
+        os.unlink(t.name)
+        # Combine stdout and stderr, decode to string, and return
+        output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8")
+        return output
+def save_level(level: List[str], filename: str):
+    concatenated = "\n".join(level)
+    with open(filename, "w") as f:
+        f.write(concatenated)
+    return filename
+class GPTSampler:
+    def __init__(
+        self,
+        mario_lm: BaseMarioLM,
+        temperature: float = 2.0,
+        top_k: int = 16,
+        context_len: int = 700,
+        use_tqdm: bool = False,
+        use_argmax: bool = False,
+    ):
+        self.mario_lm = mario_lm
+        self.temperature = temperature
+        self.top_k = top_k
+        self.context_len = context_len
+        self.use_tqdm = use_tqdm
+        self.use_argmax = use_argmax
+        self.logits_processor = LogitsProcessorList()
+        self.logits_warper = LogitsProcessorList(
+            [
+                TopKLogitsWarper(top_k),  # number of characters
+                TemperatureLogitsWarper(temperature),
+            ]
+        )
+    @property
+    def device(self) -> torch.device:
+        return self.mario_lm.device
+    def step(
+        self,
+        seed: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        with torch.no_grad():
+            attention_mask = torch.ones_like(seed).to(seed.device)
+            input_ids = seed
+            out = self.mario_lm.lm(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                token_type_ids=None,
+            )
+            logits = out.logits.detach()
+            if len(logits.shape) == 2:
+                logits = logits.view(1, 1, -1)
+            next_token_logits = logits[:, -1, :]
+            if self.use_argmax:
+                next_tokens = next_token_logits.argmax(-1)
+            else:
+                next_token_scores = self.logits_processor(input_ids, next_token_logits)
+                next_token_scores = self.logits_warper(input_ids, next_token_scores)
+                probs = torch.nn.functional.softmax(next_token_scores, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+        return next_tokens, encoder_hidden_states
+    def sample(
+        self,
+        seed: Union[Optional[torch.Tensor], Optional[SampleOutput]] = None,
+        prompts: Optional[List[str]] = None,
+        num_steps: int = 1,
+        encoder_hidden_states: torch.Tensor = None,
+        return_tensor: bool = False,
+    ):
+        self.mario_lm.eval()
+        context_len = self.context_len - 28
+        with torch.no_grad():
+            if seed is None:
+                seed = self.mario_lm.generate_seed(1, batch_size=len(prompts)).to(
+                    self.device
+                )
+                out_tensor = seed.to(self.device)
+            elif isinstance(seed, SampleOutput):
+                out_tensor = seed.level_tensor.to(self.device).squeeze()
+            else:
+                out_tensor = seed.to(self.device).squeeze()
+            if len(out_tensor.shape) < 2:
+                # if we pass in a single seed vector, then we repeat for each prompt
+                # Otherwise, we treat inputs as separate seed-prompt pairs
+                out_tensor = out_tensor.view(1, -1).repeat(len(prompts), 1)
+            if encoder_hidden_states is None:
+                if prompts is not None:
+                    encoder_hidden_states = torch.stack(
+                        [
+                            self.mario_lm.prompter.output_hidden(prompt)
+                            for prompt in prompts
+                        ]
+                    )
+                else:
+                    encoder_hidden_states = torch.stack(
+                        [
+                            self.mario_lm.prompter(sample_prompt=True)[1]
+                            for _ in range(seed.shape[0])
+                        ]
+                    )
+            encoder_hidden_states = encoder_hidden_states.to(
+                self.device
+            )  # b x 1 x hidden_dim
+            encoder_hidden_states = encoder_hidden_states.view(
+                out_tensor.shape[0], 1, -1
+            )
+            if not self.use_tqdm:
+                bar = np.arange(num_steps)
+            else:
+                bar = tqdm(np.arange(num_steps))
+            with torch.no_grad():
+                for i in bar:
+                    inp = out_tensor * 1
+                    if len(out_tensor.shape) > 0 and out_tensor.shape[-1] > context_len:
+                        diff = inp.shape[-1] % 14  # height of mario level
+                        ctx = context_len + diff
+                        inp = inp[:, -ctx:] * 1
+                    next_tokens, encoder_hidden_states = self.step(
+                        inp,
+                        encoder_hidden_states=encoder_hidden_states,
+                    )
+                    out_tensor = torch.cat(
+                        [out_tensor, next_tokens.unsqueeze(-1)], dim=-1
+                    )
+                    if self.use_tqdm:
+                        bar.set_description(
+                            f"shape: {inp.shape}, {out_tensor.shape} first: {inp[0][0]}, last: {out_tensor[0][-1]}"
+                        )
+            if self.use_tqdm:
+                bar.close()
+        sample_out = SampleOutput.from_level_predictions(
+            out_tensor,
+            out_tensor[:, -num_steps:],
+            self.mario_lm.tokenizer,
+            self.mario_lm.prompter,
+        )
+        self.mario_lm.train()
+        if return_tensor:
+            return sample_out, out_tensor
+        return sample_out
+    def __call__(self, *args, **kwargs):
+        return self.sample(*args, **kwargs)
+class BertSampler:
+    def __init__(
+        self,
+        mario_lm: BaseMarioLM,
+        temperature: float = 2.0,
+        top_k: int = 16,
+        context_len: int = 448,
+        mask_proportion: float = 0.16,
+    ):
+        self.mario_lm = mario_lm
+        self.temperature = temperature
+        self.top_k = top_k
+        self.logits_processor = LogitsProcessorList()
+        self.logits_warper = LogitsProcessorList(
+            [
+                TopKLogitsWarper(top_k),  # number of characters
+                TemperatureLogitsWarper(temperature),
+            ]
+        )
+        self.context_len = context_len
+        self.mask_proportion = mask_proportion
+        self.mask_portion = int(self.context_len * self.mask_proportion)
+        self.mask_portion = self.mask_portion - self.mask_portion % 14 + 14
+    @property
+    def device(self) -> torch.device:
+        return self.mario_lm.device
+    def get_context(self, input_ids, mask_indices):
+        start_idx = mask_indices[0]
+        end_idx = mask_indices[-1]
+        if input_ids.shape[-1] <= self.context_len:
+            clipped = input_ids.shape[-1] % 14
+            input_ids = input_ids[:clipped]
+        portion = (self.context_len - self.mask_portion) / 2
+        remainder = 0
+        left = start_idx - portion
+        if left < 0:
+            remainder = -1 * left
+        right = end_idx + portion + remainder
+        return input_ids[left:right]
+    def sample(
+        self,
+        seed: Union[torch.Tensor, SampleOutput],
+        mask: torch.Tensor,
+        return_tensor: bool = False,
+    ):
+        self.mario_lm.eval()
+        mask_indices = mask.nonzero()
+        input_ids = seed
+        if isinstance(seed, SampleOutput):
+            input_ids = seed.level_tensor.to(self.device).squeeze()
+        input_id_list = []
+        for i in range(input_ids.shape[0]):
+            input_id = input_ids[i]
+            mask_index = mask_indices[mask_indices[:, 0] == i][:, -1]
+            input_id = self.get_context(input_id, mask_index)
+            input_id_list.append(input_id)
+        input_ids = torch.stack(input_ids, dim=0).to(self.device)
+        attention_mask = torch.ones_like(input_ids).to(seed.device)
+        if len(input_ids.shape) < 2:
+            # if we pass in a single seed vector, then we repeat for each prompt
+            # Otherwise, we treat inputs as separate seed-prompt pairs
+            input_ids = input_ids.view(1, -1)
+        out = self.mario_lm.lm(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=None,
+        )
+        logits = out.logits.detach()
+        if len(logits.shape) == 2:
+            logits = logits.view(1, 1, -1)
+        if self.use_argmax:
+            tokens = logits.argmax(-1)
+        else:
+            tokens_scores = self.logits_processor(input_ids, tokens)
+            tokens_scores = self.logits_warper(input_ids, tokens_scores)
+            probs = torch.nn.functional.softmax(tokens_scores, dim=-1)
+            tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+        out = input_ids.detach()
+        for i in range(input_ids.shape[0]):
+            mask_index = mask_indices[mask_indices[:, 0] == i][:, -1]
+            out[i, mask_index] = tokens[i, mask_index].detach()
+        sample_out = SampleOutput.from_level_predictions(
+            out,
+            tokens,
+            self.mario_lm.tokenizer,
+            self.mario_lm.prompter,
+        )
+        self.mario_lm.train()
+        if return_tensor:
+            return sample_out, tokens
+        return sample_out