Spaces:

Agents-MCP-Hackathon
/

UnlimitedMusicGen

Sleeping

App Files Community

Surn committed on Jun 10, 2025

Commit

e18bcfc

1 Parent(s): 11d1fa0

Convert to MCP Client

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +1 -1
app.py +224 -329
audiocraft/__init__.py +0 -10
audiocraft/data/__init__.py +0 -8
audiocraft/data/audio.py +0 -422
audiocraft/data/audio_dataset.py +0 -587
audiocraft/data/audio_utils.py +0 -296
audiocraft/data/info_audio_dataset.py +0 -110
audiocraft/data/zip.py +0 -76
audiocraft/environment.py +0 -176
audiocraft/models/__init__.py +0 -21
audiocraft/models/builders.py +0 -351
audiocraft/models/encodec.py +0 -506
audiocraft/models/flow_matching.py +0 -516
audiocraft/models/genmodel.py +0 -273
audiocraft/models/lm.py +0 -588
audiocraft/models/lm_magnet.py +0 -500
audiocraft/models/loaders.py +0 -291
audiocraft/models/magnet.py +0 -88
audiocraft/models/musicgen.py +0 -566
audiocraft/models/unet.py +0 -214
audiocraft/modules/__init__.py +0 -21
audiocraft/modules/activations.py +0 -96
audiocraft/modules/chroma.py +0 -66
audiocraft/modules/codebooks_patterns.py +0 -548
audiocraft/modules/conditioners.py +0 -1763
audiocraft/modules/conv.py +0 -245
audiocraft/modules/diffusion_schedule.py +0 -272
audiocraft/modules/jasco_conditioners.py +0 -300
audiocraft/modules/lstm.py +0 -25
audiocraft/modules/rope.py +0 -125
audiocraft/modules/seanet.py +0 -258
audiocraft/modules/streaming.py +0 -135
audiocraft/modules/transformer.py +0 -755
audiocraft/modules/unet_transformer.py +0 -67
audiocraft/py.typed +0 -0
audiocraft/quantization/__init__.py +0 -9
audiocraft/quantization/base.py +0 -107
audiocraft/quantization/core_vq.py +0 -405
audiocraft/quantization/vq.py +0 -116
audiocraft/utils/__init__.py +0 -5
audiocraft/utils/autocast.py +0 -40
audiocraft/utils/cache.py +0 -324
audiocraft/utils/cluster.py +0 -75
audiocraft/utils/export.py +0 -79
audiocraft/utils/export_legacy.py +0 -56
audiocraft/utils/extend.py +0 -440
audiocraft/utils/notebook.py +0 -32
audiocraft/utils/utils.py +0 -328
modules/constants.py +63 -0

README.md CHANGED Viewed

@@ -10,7 +10,7 @@ app_file: app.py
 pinned: true
 license: creativeml-openrail-m
 tags:
-agent-demo-track
 - musicgen
 - unlimited
 - user history

 pinned: true
 license: creativeml-openrail-m
 tags:
+- agent-demo-track
 - musicgen
 - unlimited
 - user history

app.py CHANGED Viewed

@@ -19,11 +19,6 @@ import typing as tp
 import warnings
 import gc
 from tqdm import tqdm
-from audiocraft.models import MusicGen
-from audiocraft.data.audio import audio_write
-from audiocraft.data.audio_utils import apply_fade, apply_tafade, apply_splice_effect
-from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
-from audiocraft.utils import utils
 import numpy as np
 import random
 import shutil
@@ -35,9 +30,14 @@ from modules.version_info import versions_html, commit_hash, get_xformers_versio
 from modules.gradio import *
 from modules.file_utils import get_file_parts, get_filename_from_filepath, convert_title_to_filename, get_unique_file_path, delete_file
 MODEL = None
 MODELS = None
-IS_SHARED_SPACE = "Surn/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
 INTERRUPTED = False
 UNLOAD_MODEL = False
 MOVE_TO_CPU = False
@@ -239,343 +239,238 @@ def load_melody_filepath(melody_filepath, title, assigned_model, topp, temperatu
     return  gr.update(value=melody_name), gr.update(maximum=MAX_PROMPT_INDEX, value=-1), gr.update(value=assigned_model, interactive=True), gr.update(value=topp), gr.update(value=temperature), gr.update(value=cfg_coef), gr.update(maximum=MAX_OVERLAP)
-def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index = 0, include_title = True, include_settings = True, harmony_only = False, profile = gr.OAuthProfile, segment_length = 30, settings_font_size=28, settings_animate_waveform=False, video_orientation="Landscape", excerpt_duration=3.5, progress=gr.Progress(track_tqdm=True)):
     """
-    Generate music and video based on the provided parameters and model.
-    Args:
-        model (str): Model name to use for generation.
-        text (str): Prompt describing the music.
-        melody_filepath (str): Path to melody conditioning file. default to None.
-        duration (int): Total duration in seconds.
-        dimension (int): Audio stacking/concatenation dimension.
-        topk (int): Top-k sampling value.
-        topp (float): Top-p sampling value.
-        temperature (float): Sampling temperature.
-        cfg_coef (float): Classifier-free guidance coefficient.
-        background (str): Path to background image. default to "./assets/background.png".
-        title (str): Song title.
-        settings_font (str): Path to font file.
-        settings_font_color (str): Font color for settings text.
-        seed (int): Random seed.
-        overlap (int, optional): Segment overlap in seconds.
-        prompt_index (int, optional): Melody segment index.
-        include_title (bool, optional): Whether to add title to video.
-        include_settings (bool, optional): Whether to add settings to video.
-        harmony_only (bool, optional): Whether to use harmony only.
-        profile (gr.OAuthProfile): User profile.
-        segment_length (int, optional): Segment length in seconds.
-        settings_font_size (int, optional): Font size for settings text.
-        settings_animate_waveform (bool, optional): Animate waveform in video.
-        video_orientation (str, optional): Video orientation.
-        excerpt_duration (float, optional): Excerpt duration for style conditioning.
-        progress (gr.Progress, optional): Gradio progress tracker.
-    Returns:
-        tuple: (waveform_video_path, wave_file_path, seed_used)
     """
-    global MODEL, INTERRUPTED, INTERRUPTING, MOVE_TO_CPU
-    output_segments = None
-    melody_name = "Not Used"
-    melody_extension = "Not Used"
-    melody = None
-    if melody_filepath in ["None", ""]:
-        melody_filepath = None
-    if background in ["None", ""]:
-        background = "./assets/background.png"
-    if melody_filepath:
-        melody_name, melody_extension = get_filename_from_filepath(melody_filepath)
-        melody = get_melody(melody_filepath)
     INTERRUPTED = False
     INTERRUPTING = False
-    if temperature < 0:
-        temperature = 0.1
-        raise gr.Error("Temperature must be >= 0.")
-    if topk < 0:
-        topk = 1
-        raise gr.Error("Topk must be non-negative.")
-    if topp < 0:
-        topp =1
-        raise gr.Error("Topp must be non-negative.")
-    # Clean up GPU resources only if the model changes
-    if MODEL is not None and model not in MODEL.name:
-        print(f"Switching model from {MODEL.name} to {model}. Cleaning up resources.")
-        del MODEL  # Delete the current model
-        torch.cuda.empty_cache()  # Clear GPU memory
-        gc.collect()  # Force garbage collection
-        MODEL = None
     try:
-        if MODEL is None or model not in MODEL.name:
-            MODEL = load_model(model)
-        else:
-            if MOVE_TO_CPU:
-                MODEL.to('cuda')
-    except Exception as e:
-        raise gr.Error(f"Error loading model '{model}': {str(e)}. Try a different model.")
-    # prevent hacking
-    duration = min(duration, 720)
-    overlap =  min(overlap, 15)
-    #
-    output = None
-    segment_duration = duration
-    initial_duration = duration
-    output_segments = []
-    while duration > 0:
-        if not output_segments: # first pass of long or short song
-            if segment_duration > MODEL.lm.cfg.dataset.segment_duration:
-                segment_duration = MODEL.lm.cfg.dataset.segment_duration
-            else:
-                segment_duration = duration
-        else: # next pass of long song
-            if duration + overlap < MODEL.lm.cfg.dataset.segment_duration:
-                segment_duration = duration + overlap
-            else:
-                segment_duration = MODEL.lm.cfg.dataset.segment_duration
-        if (segment_length + overlap) < segment_duration:
-            segment_duration = segment_length + overlap
-        # implement seed
-        if seed < 0:
-            seed = random.randint(0, 0xffff_ffff_ffff)
-        torch.manual_seed(seed)
-        print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
-        if ("style" in model) and melody:
-            # style and text-to-music
-            MODEL.set_generation_params(
-                use_sampling=True,
-                top_k=topk,
-                top_p=topp,
-                temperature=temperature,
-                cfg_coef=cfg_coef,
-                duration=segment_duration,
-                two_step_cfg=False,
-                cfg_coef_beta=5, # double CFG is only useful for text-and-style conditioning
-            )
-            MODEL.set_style_conditioner_params(
-                eval_q=3, # integer between 1 and 6
-                            # eval_q is the level of quantization that passes
-                            # through the conditioner. When low, the models adheres less to the
-                            # audio conditioning
-                excerpt_length=excerpt_duration, # the length in seconds that is taken by the model in the provided excerpt, can be
-                                    # between 1.5 and 4.5 seconds but it has to be shortest to the length of the provided conditioning
-            )
-        else:
-            MODEL.set_generation_params(
-                use_sampling=True,
-                top_k=topk,
-                top_p=topp,
-                temperature=temperature,
-                cfg_coef=cfg_coef,
-                duration=segment_duration,
-                two_step_cfg=False,
-                extend_stride=2,
-                rep_penalty=0.5,
-                cfg_coef_beta=None, # double CFG is only useful for text-and-style conditioning
             )
-        MODEL.set_custom_progress_callback(gr.Progress(track_tqdm=True))
-        try:
-            if melody and ("melody" or "style" in model):
-                # return excess duration, load next model and continue in loop structure building up output_segments
-                if duration > MODEL.duration:
-                    output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.duration, prompt_index, harmony_only, excerpt_duration, progress=gr.Progress(track_tqdm=True))
-                else:
-                    # pure original code
-                    sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
-                    print(melody.shape)
-                    if melody.dim() == 2:
-                        melody = melody[None]
-                    melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
-                    output = MODEL.generate_with_chroma(
-                        descriptions=[text],
-                        melody_wavs=melody,
-                        melody_sample_rate=sr,
-                        progress=False, progress_callback=gr.Progress(track_tqdm=True)
-                    )
-                # All output_segments are populated, so we can break the loop or set duration to 0
-                break
-            else:
-                #output = MODEL.generate(descriptions=[text], progress=False)
-                if not output_segments:
-                    next_segment = MODEL.generate(descriptions=[text], progress=False, progress_callback=gr.Progress(track_tqdm=True))
-                    duration -= segment_duration
-                else:
-                    last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
-                    next_segment = MODEL.generate_continuation(last_chunk, MODEL.sample_rate, descriptions=[text], progress=False, progress_callback=gr.Progress(track_tqdm=True))
-                    duration -= segment_duration - overlap
-                if next_segment != None:
-                    output_segments.append(next_segment)
-        except Exception as e:
-            print(f"Error generating audio: {e}")
-            gr.Error(f"Error generating audio: {e}")
-            return None, None, seed
-        if INTERRUPTING:
-            INTERRUPTED = True
-            INTERRUPTING = False
-            print("Function execution interrupted!")
-            raise gr.Error("Interrupted.")
-    print(f"\nOutput segments: {len(output_segments)}\n")
-    if output_segments:
-        try:
-            # Combine the output segments into one long audio file or stack tracks
-            #output_segments = [segment.detach().cpu().float()[0] for segment in output_segments]
-            #output = torch.cat(output_segments, dim=dimension)
-            output = output_segments[0]
-            for i in range(1, len(output_segments)):
-                if overlap > 0:
-                    overlap_samples = overlap * MODEL.sample_rate
-                    #stack tracks and fade out/in
-                    overlapping_output_fadeout = output[:, :, -overlap_samples:]
-                    #overlapping_output_fadeout = apply_fade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True, curve_end=0.0, current_device=MODEL.device)
-                    overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout,sample_rate=MODEL.sample_rate,duration=overlap,out=True,start=True,shape="linear")
-                    overlapping_output_fadein = output_segments[i][:, :, :overlap_samples]
-                    #overlapping_output_fadein = apply_fade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, curve_start=0.0, current_device=MODEL.device)
-                    overlapping_output_fadein = apply_tafade(overlapping_output_fadein,sample_rate=MODEL.sample_rate,duration=overlap,out=False,start=False, shape="linear")
-                    overlapping_output = torch.cat([overlapping_output_fadeout[:, :, :-(overlap_samples // 2)], overlapping_output_fadein],dim=2)
-                    ###overlapping_output, overlap_sample_rate = apply_splice_effect(overlapping_output_fadeout, MODEL.sample_rate, overlapping_output_fadein, MODEL.sample_rate, overlap)
-                    print(f" overlap size Fade:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
-                    ##overlapping_output = torch.cat([output[:, :, -overlap_samples:], output_segments[i][:, :, :overlap_samples]], dim=1) #stack tracks
-                    ##print(f" overlap size stack:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
-                    #overlapping_output = torch.cat([output[:, :, -overlap_samples:], output_segments[i][:, :, :overlap_samples]], dim=2) #stack tracks
-                    #print(f" overlap size cat:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
-                    output = torch.cat([output[:, :, :-overlap_samples], overlapping_output, output_segments[i][:, :, overlap_samples:]], dim=dimension)
-                else:
-                    output = torch.cat([output, output_segments[i]], dim=dimension)
-            output = output.detach().cpu().float()[0]
-        except Exception as e:
-            print(f"Error combining segments: {e}. Using the first segment only.")
-            output = output_segments[0].detach().cpu().float()[0]
-    else:
-        if (output is None) or (output.dim() == 0):
-            return None, None, seed
-        else:
-            output = output.detach().cpu().float()[0]
-    video_width, video_height = 768, 512
-    if video_orientation == "Portait":
-        video_width, video_height = 512, 768
-    title_file_name = convert_title_to_filename(title)
-    with NamedTemporaryFile("wb", suffix=".wav", delete=False, prefix=title_file_name) as file:
-        video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody Condition:{melody_name}\n Sample Segment: {prompt_index}"
-        if include_settings or include_title:
-            background = add_settings_to_image(title if include_title else "",video_description if include_settings else "",width=video_width, height=video_height, background_path=background,font=settings_font,font_color=settings_font_color, font_size=settings_font_size)
-        audio_write(
-            file.name, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
-        waveform_video_path = get_waveform(file.name, bg_image=background, bar_count=45, name=title_file_name, animate=settings_animate_waveform, progress=gr.Progress(track_tqdm=True))
-        # Remove the extension from file.name
-        file_name_without_extension = os.path.splitext(file.name)[0]
-        # Get the directory, filename, name, extension, and new extension of the waveform video path
-        video_dir, video_name, video_name, video_ext, video_new_ext = get_file_parts(waveform_video_path)
-        new_video_path = get_unique_file_path(video_dir, title_file_name, video_new_ext)
-        mp4 = MP4(waveform_video_path)
-        mp4["©nam"] = title_file_name        # Title tag
-        mp4["desc"] = f"{text}\n Duration: {str(initial_duration)}" # Description tag
-        commit = commit_hash()
-        metadata = {
-            "Title": title,
-            "Year": time.strftime("%Y"),
-            "prompt": text,
-            "negative_prompt": "",
-            "Seed": seed,
-            "steps": 1,
-            "wdth": video_width,
-            "hght": video_height,
-            "Dimension": dimension,
-            "Top-k": topk,
-            "Top-p": topp,
-            "Randomness": temperature,
-            "cfg": cfg_coef,
-            "overlap": overlap,
-            "Melody Condition": melody_name,
-            "Sample Segment": prompt_index,
-            "Duration": initial_duration,
-            "Audio": file.name,
-            "font": settings_font,
-            "font_color": settings_font_color,
-            "font_size": settings_font_size,
-            "harmony_only": harmony_only,
-            "background": background,
-            "include_title": include_title,
-            "include_settings": include_settings,
-            "profile": "Satoshi Nakamoto" if profile.value is None else profile.value.username,
-            "commit": commit_hash(),
-            "tag": git_tag(),
-            "version": gr.__version__,
-            "model_version": MODEL.version,
-            "model_name": MODEL.name,
-            "model_description": f"{MODEL.audio_channels} channels, {MODEL.sample_rate} Hz",
-            "melody_name": melody_name if melody_name else "",
-            "melody_extension": melody_extension if melody_extension else "",
-            "hostname": "https://huggingface.co/spaces/Surn/UnlimitedMusicGen",
-            "version": f"https://huggingface.co/spaces/Surn/UnlimitedMusicGen/commit/{'huggingface' if commit == '<none>' else commit}",
-            "python": sys.version,
-            "torch": getattr(torch, '__long_version__', torch.__version__),
-            "xformers": get_xformers_version(),
-            "gradio": gr.__version__,
-            "huggingface_space": os.environ.get('SPACE_ID', ''),
-            "CUDA": f"{'CUDA is available. device: ' + torch.cuda.get_device_name(0) + ' version: ' + torch.version.cuda if torch.cuda.is_available() else 'CUDA is not available.'}",
-        }
-        # Add additional metadata from the metadata dictionary (if it exists)
-        for key, value in metadata.items():
-            mp4[key] = str(value)  # Convert values to strings as required by mutagen
-        # Save the metadata changes to the file
-        mp4.save()
-        try:
-            os.replace(waveform_video_path, new_video_path)
-            waveform_video_path = new_video_path
-        except Exception as e:
-            print(f"Error renaming file: {e}")
-        if waveform_video_path:
-            modules.user_history.save_file(
-            profile=profile.value,
-            image=background,
-            audio=file.name,
-            video=waveform_video_path,
-            label=title,
-            metadata=metadata,
-            progress=gr.Progress(track_tqdm=True)
-        )
-    if MOVE_TO_CPU:
-        MODEL.to('cpu')
-    if UNLOAD_MODEL:
-        MODEL = None
-    # Explicitly delete large tensors or objects
-    del output_segments, output, melody, melody_name, melody_extension, metadata, mp4
-    # Force garbage collection
-    #gc.collect()
-    # Synchronize CUDA streams
-    torch.cuda.synchronize()
-    #torch.cuda.empty_cache()
-    torch.cuda.ipc_collect()
-    return waveform_video_path, file.name, seed
 gr.set_static_paths(paths=["fonts/","assets/","images/"])
 def ui(**kwargs):

 import warnings
 import gc
 from tqdm import tqdm
 import numpy as np
 import random
 import shutil
 from modules.gradio import *
 from modules.file_utils import get_file_parts, get_filename_from_filepath, convert_title_to_filename, get_unique_file_path, delete_file
+# Added for MCP call
+from smolagents.mcp_client import MCPClient
+from modules.storage import upload_files_to_repo  # Added import
+from modules.constants import HF_REPO_ID  # Added import
 MODEL = None
 MODELS = None
+IS_SHARED_SPACE = "Agents-MCP-Hackathon/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
 INTERRUPTED = False
 UNLOAD_MODEL = False
 MOVE_TO_CPU = False
     return  gr.update(value=melody_name), gr.update(maximum=MAX_PROMPT_INDEX, value=-1), gr.update(value=assigned_model, interactive=True), gr.update(value=topp), gr.update(value=temperature), gr.update(value=cfg_coef), gr.update(maximum=MAX_OVERLAP)
+def predict(
+    model_name_arg,  # Renamed from 'model'
+    text_arg,
+    melody_filepath_arg,
+    duration_arg,
+    dimension_arg,
+    topk_arg,
+    topp_arg,
+    temperature_arg,
+    cfg_coef_arg,
+    background_image_arg, # Renamed from 'background'
+    title_arg,
+    settings_font_path_arg, # Renamed from 'settings_font'
+    settings_font_color_arg,
+    seed_arg,
+    overlap_arg=1,
+    prompt_index_arg=0,
+    include_title_arg=True,
+    include_settings_arg=True,
+    harmony_only_arg=False,
+    profile_arg: tp.Optional[gr.OAuthProfile] = None, # Type hint for clarity, Gradio passes OAuthProfile or None
+    segment_length_arg=30,
+    settings_font_size_arg=28,
+    settings_animate_waveform_arg=False,
+    video_orientation_arg="Landscape",
+    excerpt_duration_arg=3.5,
+    progress_arg=gr.Progress(track_tqdm=True) # Renamed from 'progress', Gradio handles this
+):
     """
+    Generate music and video by calling a remote MCP endpoint tool.
+    This function replaces the original local model inference.
     """
+    global INTERRUPTED, INTERRUPTING # Retained, though effect on remote job is indirect
     INTERRUPTED = False
     INTERRUPTING = False
+    # Helper to get value if it's a Gradio State object
+    def get_value_if_state(arg):
+        if hasattr(arg, 'value') and arg.value is not None:
+            return arg.value
+        return arg
+    model_name_arg = get_value_if_state(model_name_arg)
+    text_arg = get_value_if_state(text_arg)
+    melody_filepath_arg = get_value_if_state(melody_filepath_arg)
+    duration_arg = get_value_if_state(duration_arg)
+    dimension_arg = get_value_if_state(dimension_arg)
+    topk_arg = get_value_if_state(topk_arg)
+    topp_arg = get_value_if_state(topp_arg)
+    temperature_arg = get_value_if_state(temperature_arg)
+    cfg_coef_arg = get_value_if_state(cfg_coef_arg)
+    background_image_arg = get_value_if_state(background_image_arg)
+    title_arg = get_value_if_state(title_arg)
+    settings_font_path_arg = get_value_if_state(settings_font_path_arg)
+    settings_font_color_arg = get_value_if_state(settings_font_color_arg)
+    seed_arg = get_value_if_state(seed_arg)
+    overlap_arg = get_value_if_state(overlap_arg)
+    prompt_index_arg = get_value_if_state(prompt_index_arg)
+    include_title_arg = get_value_if_state(include_title_arg)
+    include_settings_arg = get_value_if_state(include_settings_arg)
+    # harmony_only_arg is handled specifically below
+    # profile_arg is handled specifically below
+    segment_length_arg = get_value_if_state(segment_length_arg)
+    settings_font_size_arg = get_value_if_state(settings_font_size_arg)
+    settings_animate_waveform_arg = get_value_if_state(settings_animate_waveform_arg)
+    video_orientation_arg = get_value_if_state(video_orientation_arg)
+    excerpt_duration_arg = get_value_if_state(excerpt_duration_arg)
+    print("Initiating MCP call to https://surn-unlimitedmusicgen.hf.space/gradio_api/mcp/sse tool UnlimitedMusicGen_predict")
+    mcp_client = None
+    melody_file_url = None  # Changed from melody_file_obj
+    background_image_url = None  # Changed from background_reference
     try:
+        # Upload files to Hugging Face Hub and get URLs
+        files_to_upload = []
+        if melody_filepath_arg and melody_filepath_arg not in ["None", ""]:
+            files_to_upload.append(melody_filepath_arg)
+        if background_image_arg and background_image_arg not in ["None", ""] and not background_image_arg.startswith("http"):  # only upload if not already a URL
+            files_to_upload.append(background_image_arg)
+        uploaded_file_urls = {}
+        if files_to_upload:
+            # Use a unique folder name for each upload session, e.g., based on timestamp or a random string
+            # For simplicity, using a fixed folder name here, but consider making it unique.
+            # The username from profile_arg could be used to create a user-specific folder.
+            profile_username_for_folder = "default_user"
+            if profile_arg:
+                # Check if profile_arg is a Gradio State object holding an OAuthProfile or string
+                actual_profile_data = profile_arg
+                if hasattr(profile_arg, 'value') and profile_arg.value is not None: # Handles gr.State wrapping OAuthProfile or string
+                    actual_profile_data = profile_arg.value
+                if hasattr(actual_profile_data, 'username') and actual_profile_data.username: # OAuthProfile object
+                    profile_username_for_folder = actual_profile_data.username
+                elif isinstance(actual_profile_data, str) and actual_profile_data: # String username
+                    profile_username_for_folder = actual_profile_data
+            folder_name = f"user_uploads/{profile_username_for_folder}/{time.strftime('%Y%m%d%H%M%S')}"
+            upload_results = upload_files_to_repo(
+                files=files_to_upload,
+                repo_id=HF_REPO_ID,  # Make sure HF_REPO_ID is defined in constants
+                folder_name=folder_name,
+                create_permalink=False  # We need individual links
             )
+            print(f"Upload results: {upload_results}")
+            if isinstance(upload_results, list):
+                for i, file_path in enumerate(files_to_upload):
+                    original_filename = os.path.basename(file_path)
+                    # Find the corresponding URL from upload_results
+                    # The upload_results list contains tuples of (response, link)
+                    # We need to match the uploaded file with its original path to assign the correct URL
+                    # Assuming the order is preserved or filenames in links are reliable
+                    for _, link in upload_results:
+                        if original_filename in link:
+                            uploaded_file_urls[file_path] = link
+                            break
+            else:  # Handle dict case or errors if necessary, though create_permalink=False should yield a list
+                print(f"Warning: Expected a list from upload_files_to_repo, got {type(upload_results)}")
+        if melody_filepath_arg and melody_filepath_arg in uploaded_file_urls:
+            melody_file_url = uploaded_file_urls[melody_filepath_arg]
+            print(f"Melody file uploaded to: {melody_file_url}")
+        elif melody_filepath_arg and melody_filepath_arg not in ["None", ""]:  # File was provided but not uploaded (e.g. error)
+            print(f"Warning: Melody file {melody_filepath_arg} was provided but not successfully uploaded or URL not found.")
+        if background_image_arg and background_image_arg in uploaded_file_urls:
+            background_image_url = uploaded_file_urls[background_image_arg]
+            print(f"Background image uploaded to: {background_image_url}")
+        elif background_image_arg and background_image_arg.startswith("http"):
+            background_image_url = background_image_arg  # It's already a URL
+            print(f"Using existing background image URL: {background_image_url}")
+        elif background_image_arg and background_image_arg not in ["None", ""]:  # File was provided but not uploaded
+            print(f"Warning: Background image {background_image_arg} was provided but not successfully uploaded or URL not found.")
+        mcp_client = MCPClient({"url": "https://surn-unlimitedmusicgen.hf.space/gradio_api/mcp/sse"})
+        tools = mcp_client.get_tools()
+        predict_tool = next((t for t in tools if t.name == "UnlimitedMusicGen_predict"), None)
+        if not predict_tool:
+            raise gr.Error("MCP tool 'UnlimitedMusicGen_predict' not found on the server.")
+        profile_username_to_send = "Satoshi Nakamoto"
+        if profile_arg:
+            actual_profile_data = profile_arg
+            # Unwrap if it's a gr.State object
+            if hasattr(profile_arg, 'value') and profile_arg.value is not None:
+                actual_profile_data = profile_arg.value
+            # Now actual_profile_data is either an OAuthProfile or a string username
+            if hasattr(actual_profile_data, 'username') and actual_profile_data.username: # OAuthProfile
+                profile_username_to_send = actual_profile_data.username
+            elif isinstance(actual_profile_data, str) and actual_profile_data: # string username
+                profile_username_to_send = actual_profile_data
+        actual_harmony_only = False
+        if isinstance(harmony_only_arg, str):
+            actual_harmony_only = harmony_only_arg.lower() == "yes"
+        elif isinstance(harmony_only_arg, bool):
+            actual_harmony_only = harmony_only_arg
+        tool_args = {
+            "model_name_arg": model_name_arg,
+            "text_arg": text_arg,
+            "melody_filepath_arg": melody_file_url,  # Pass URL instead of file object
+            "duration_arg": duration_arg,
+            "dimension_arg": dimension_arg,
+            "topk_arg": topk_arg,
+            "topp_arg": topp_arg,
+            "temperature_arg": temperature_arg,
+            "cfg_coef_arg": cfg_coef_arg,
+            "background_image_arg": background_image_url,  # Pass URL
+            "title_arg": title_arg,
+            "settings_font_path_arg": settings_font_path_arg,
+            "settings_font_color_arg": settings_font_color_arg,
+            "seed_arg": seed_arg,
+            "overlap_arg": overlap_arg,
+            "prompt_index_arg": prompt_index_arg,
+            "include_title_arg": include_title_arg,
+            "include_settings_arg": include_settings_arg,
+            "harmony_only_arg": actual_harmony_only,
+            "profile_arg": profile_username_to_send,
+            "segment_length_arg": segment_length_arg,
+            "settings_font_size_arg": settings_font_size_arg,
+            "settings_animate_waveform_arg": settings_animate_waveform_arg,
+            "video_orientation_arg": video_orientation_arg,
+            "excerpt_duration_arg": excerpt_duration_arg,
+        }
+        print(f"Calling remote MCP tool 'UnlimitedMusicGen_predict' with arguments (URLs for files).")
+        results = predict_tool(**tool_args)
+        print(f"MCP tool call completed. Raw results: {results}")
+        if not isinstance(results, (list, tuple)) or len(results) != 3:
+            raise gr.Error(f"MCP tool 'UnlimitedMusicGen_predict' did not return the expected 3 values. Received: {results}")
+        waveform_video_path, wave_file_path, seed_used = results
+        if not ((waveform_video_path is None or isinstance(waveform_video_path, str)) and
+              (wave_file_path is None or isinstance(wave_file_path, str))):
+            error_msg = (f"MCP tool returned invalid file paths. "
+                         f"Video path type: {type(waveform_video_path)}, "
+                         f"Audio path type: {type(wave_file_path)}")
+            raise gr.Error(error_msg)
+        if not isinstance(seed_used, (int, float)):  # Allow float for seed then cast later
+            raise gr.Error(f"MCP tool returned a non-numeric seed. Received type: {type(seed_used)}, value: {seed_used}")
+        return waveform_video_path, wave_file_path, int(seed_used)
+    except Exception as e:
+        error_message = f"Error during MCP tool call or file upload: {str(e)}"
+        print(error_message)
+        import traceback
+        traceback.print_exc()
+        if isinstance(e, gr.Error):
+            raise
+        else:
+            raise gr.Error(error_message)
+    finally:
+        # No file objects to close here as we are passing URLs
+        if mcp_client:
+            try:
+                mcp_client.disconnect()
+                print("MCP client disconnected.")
+            except Exception as e_disconnect:
+                print(f"Error disconnecting MCP client: {e_disconnect}")
 gr.set_static_paths(paths=["fonts/","assets/","images/"])
 def ui(**kwargs):

audiocraft/__init__.py DELETED Viewed

@@ -1,10 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# flake8: noqa
-from . import data, modules, models
-__version__ = '1.3.Surn-MCP'

audiocraft/data/__init__.py DELETED Viewed

@@ -1,8 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# flake8: noqa
-from . import audio, audio_dataset, info_audio_dataset

audiocraft/data/audio.py DELETED Viewed

@@ -1,422 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Audio IO methods are defined in this module (info, read, write),
-We rely on av library for faster read when possible, otherwise on torchaudio.
-"""
-from dataclasses import dataclass
-from pathlib import Path
-import logging
-import typing as tp
-import numpy as np
-import soundfile
-import torch
-from torch.nn import functional as F
-import torchaudio as ta
-import av
-import subprocess as sp
-from .audio_utils import f32_pcm, i16_pcm, normalize_audio, convert_audio
-_av_initialized = False
-def _init_av():
-    global _av_initialized
-    if _av_initialized:
-        return
-    logger = logging.getLogger('libav.mp3')
-    logger.setLevel(logging.ERROR)
-    _av_initialized = True
-@dataclass(frozen=True)
-class AudioFileInfo:
-    """Immutable summary of an audio file, built by `_av_info` and `_soundfile_info`."""
-    # Sample rate reported by the probing backend.
-    sample_rate: int
-    # Duration reported by the backend (presumably in seconds; `_av_info` derives it
-    # from stream.duration * stream.time_base).
-    duration: float
-    # Number of audio channels.
-    channels: int
-def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
-    _init_av()
-    with av.open(str(filepath)) as af:
-        stream = af.streams.audio[0]
-        sample_rate = stream.codec_context.sample_rate
-        duration = float(stream.duration * stream.time_base)
-        channels = stream.channels
-        return AudioFileInfo(sample_rate, duration, channels)
-def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
-    info = soundfile.info(filepath)
-    return AudioFileInfo(info.samplerate, info.duration, info.channels)
-def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
-    # torchaudio no longer returns useful duration informations for some formats like mp3s.
-    filepath = Path(filepath)
-    if filepath.suffix in ['.flac', '.ogg']:  # TODO: Validate .ogg can be safely read with av_info
-        # ffmpeg has some weird issue with flac.
-        return _soundfile_info(filepath)
-    else:
-        return _av_info(filepath)
-def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
-    """FFMPEG-based audio file reading using PyAV bindings.
-    Soundfile cannot read mp3 and av_read is more efficient than torchaudio.
-    Args:
-        filepath (str or Path): Path to audio file to read.
-        seek_time (float): Time at which to start reading in the file.
-        duration (float): Duration to read from the file. If set to -1, the whole file is read.
-            Note that any value giving a frame count of 0 or less (including 0) also
-            reads to the end of the file, because cropping only happens for a positive count.
-    Returns:
-        tuple of torch.Tensor, int: Tuple containing audio data (converted with `f32_pcm`,
-            channels on the first dimension) and sample rate
-    Raises:
-        AssertionError: If no frame was decoded or the channel count of the result
-            does not match the stream.
-    """
-    _init_av()
-    with av.open(str(filepath)) as af:
-        stream = af.streams.audio[0]
-        sr = stream.codec_context.sample_rate
-        # Requested length and start position, expressed in samples.
-        num_frames = int(sr * duration) if duration >= 0 else -1
-        frame_offset = int(sr * seek_time)
-        # we need a small negative offset otherwise we get some edge artifact
-        # from the mp3 decoder.
-        af.seek(int(max(0, (seek_time - 0.1)) / stream.time_base), stream=stream)
-        frames = []
-        length = 0
-        for frame in af.decode(streams=stream.index):
-            # Sample position of this frame, derived from its timestamp.
-            current_offset = int(frame.rate * frame.pts * frame.time_base)
-            # Samples to drop from the head of the frame so output starts at frame_offset.
-            strip = max(0, frame_offset - current_offset)
-            buf = torch.from_numpy(frame.to_ndarray())
-            # Bring the buffer to [channels, samples] when the first dimension is not the
-            # channel count (presumably a packed/interleaved layout; TODO confirm).
-            if buf.shape[0] != stream.channels:
-                buf = buf.view(-1, stream.channels).t()
-            buf = buf[:, strip:]
-            frames.append(buf)
-            length += buf.shape[1]
-            # Stop decoding as soon as enough samples were collected.
-            if num_frames > 0 and length >= num_frames:
-                break
-        assert frames
-        # If the above assert fails, it is likely because we seeked past the end of file point,
-        # in which case ffmpeg returns a single frame with only zeros, and a weird timestamp.
-        # This will need proper debugging, in due time.
-        wav = torch.cat(frames, dim=1)
-        assert wav.shape[0] == stream.channels
-        # The last decoded frame may overshoot the request, so crop to the exact length.
-        if num_frames > 0:
-            wav = wav[:, :num_frames]
-        return f32_pcm(wav), sr
-def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
-               duration: float = -1.0, pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
-    """Read audio by picking the most appropriate backend tool based on the audio format.
-    Args:
-        filepath (str or Path): Path to audio file to read.
-        seek_time (float): Time at which to start reading in the file.
-        duration (float): Duration to read from the file. If set to -1, the whole file is read.
-        pad (bool): Pad output audio if not reaching expected duration.
-    Returns:
-        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
-    """
-    fp = Path(filepath)
-    if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
-        # There is some bug with ffmpeg and reading flac
-        info = _soundfile_info(filepath)
-        frames = -1 if duration <= 0 else int(duration * info.sample_rate)
-        frame_offset = int(seek_time * info.sample_rate)
-        wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
-        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
-        wav = torch.from_numpy(wav).t().contiguous()
-        if len(wav.shape) == 1:
-            wav = torch.unsqueeze(wav, 0)
-    elif (
-        fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
-        and duration <= 0 and seek_time == 0
-    ):
-        # Torchaudio is faster if we load an entire file at once.
-        wav, sr = ta.load(fp)
-    else:
-        wav, sr = _av_read(filepath, seek_time, duration)
-    if pad and duration > 0:
-        expected_frames = int(duration * sr)
-        wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))
-    return wav, sr
-def _piping_to_ffmpeg(out_path: tp.Union[str, Path], wav: torch.Tensor, sample_rate: int, flags: tp.List[str]):
-    # ffmpeg is always installed and torchaudio is a bit unstable lately, so let's bypass it entirely.
-    assert wav.dim() == 2, wav.shape
-    command = [
-        'ffmpeg',
-        '-loglevel', 'error',
-        '-y', '-f', 'f32le', '-ar', str(sample_rate), '-ac', str(wav.shape[0]),
-        '-i', '-'] + flags + [str(out_path)]
-    input_ = f32_pcm(wav).t().detach().cpu().numpy().tobytes()
-    sp.run(command, input=input_, check=True)
-def audio_write(stem_name: tp.Union[str, Path],
-                wav: torch.Tensor, sample_rate: int,
-                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
-                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                loudness_compressor: bool = False,
-                log_clipping: bool = True, make_parent_dir: bool = True,
-                add_suffix: bool = True, channels:int = 1) -> Path:
-    """Convenience function for saving audio to disk. Returns the filename the audio was written to.
-    Args:
-        stem_name (str or Path): Filename without extension which will be added automatically.
-        format (str): Either "wav" or "mp3".
-        mp3_rate (int): kbps when using mp3s.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
-         when strategy is 'loudness'log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms').
-        make_parent_dir (bool): Make parent directory if it doesn't exist.
-    Returns:
-        Path: Path of the saved audio.
-    """
-    assert wav.dtype.is_floating_point, "wav is not floating point"
-    if wav.dim() == 1:
-        wav = wav[None]
-    elif wav.dim() > 2:
-        raise ValueError("Input wav should be at most 2 dimension.")
-    assert wav.isfinite().all()
-    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
-                          rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
-                          sample_rate=sample_rate, stem_name=str(stem_name))
-    if channels > 1:
-        wav = convert_audio(wav,sample_rate, sample_rate, channels)
-    kwargs: dict = {}
-    if format == 'mp3':
-        suffix = '.mp3'
-        kwargs.update({"compression": mp3_rate})
-    elif format == 'wav':
-        wav = i16_pcm(wav)
-        suffix = '.wav'
-        kwargs.update({"encoding": "PCM_S", "bits_per_sample": 16})
-    else:
-        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
-    if not add_suffix:
-        suffix = ''
-    path = Path(str(stem_name) + suffix)
-    if make_parent_dir:
-        path.parent.mkdir(exist_ok=True, parents=True)
-    try:
-        ta.save(path, wav, sample_rate, **kwargs)
-    except Exception:
-        if path.exists():
-            # we do not want to leave half written files around.
-            path.unlink()
-        raise
-    return path
-def audio_write2(stem_name: tp.Union[str, Path],
-                wav: torch.Tensor, sample_rate: int,
-                format: str = 'wav', mp3_rate: int = 320, ogg_rate: tp.Optional[int] = None,
-                normalize: bool = True, strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                loudness_compressor: bool = False,
-                log_clipping: bool = True, make_parent_dir: bool = True,
-                add_suffix: bool = True) -> Path:
-    """Convenience function for saving audio to disk. Returns the filename the audio was written to.
-    Args:
-        stem_name (str or Path): Filename without extension which will be added automatically.
-        wav (torch.Tensor): Audio data to save.
-        sample_rate (int): Sample rate of audio data.
-        format (str): Either "wav", "mp3", "ogg", or "flac".
-        mp3_rate (int): kbps when using mp3s.
-        ogg_rate (int): kbps when using ogg/vorbis. If not provided, let ffmpeg decide for itself.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
-         when strategy is 'loudness' log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms').
-        make_parent_dir (bool): Make parent directory if it doesn't exist.
-    Returns:
-        Path: Path of the saved audio.
-    """
-    assert wav.dtype.is_floating_point, "wav is not floating point"
-    if wav.dim() == 1:
-        wav = wav[None]
-    elif wav.dim() > 2:
-        raise ValueError("Input wav should be at most 2 dimension.")
-    assert wav.isfinite().all()
-    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
-                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
-                          log_clipping=log_clipping, sample_rate=sample_rate,
-                          stem_name=str(stem_name))
-    if format == 'mp3':
-        suffix = '.mp3'
-        flags = ['-f', 'mp3', '-c:a', 'libmp3lame', '-b:a', f'{mp3_rate}k']
-    elif format == 'wav':
-        suffix = '.wav'
-        flags = ['-f', 'wav', '-c:a', 'pcm_s16le']
-    elif format == 'ogg':
-        suffix = '.ogg'
-        flags = ['-f', 'ogg', '-c:a', 'libvorbis']
-        if ogg_rate is not None:
-            flags += ['-b:a', f'{ogg_rate}k']
-    elif format == 'flac':
-        suffix = '.flac'
-        flags = ['-f', 'flac']
-    else:
-        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
-    if not add_suffix:
-        suffix = ''
-    path = Path(str(stem_name) + suffix)
-    if make_parent_dir:
-        path.parent.mkdir(exist_ok=True, parents=True)
-    try:
-        _piping_to_ffmpeg(path, wav, sample_rate, flags)
-    except Exception:
-        if path.exists():
-            # we do not want to leave half written files around.
-            path.unlink()
-        raise
-    return path
-def get_spec(y, sr=16000, n_fft=4096, hop_length=128, dur=8) -> np.ndarray:
-    """Get the mel-spectrogram (in dB, relative to its maximum) from the raw audio.
-    Args:
-        y (numpy array): raw input
-        sr (int): Sampling rate. Default is 16000.
-        n_fft (int): Number of samples per FFT. Default is 4096.
-        hop_length (int): Number of samples between successive frames. Default is 128.
-        dur (float): Maximum duration to get the spectrograms.
-            NOTE(review): this argument is currently not used by the body; the input
-            is not cropped here.
-    Returns:
-        The mel-spectrogram converted with `librosa.power_to_db` as a numpy array
-    """
-    # librosa is imported lazily, inside the function.
-    import librosa
-    import librosa.display
-    spectrogram = librosa.feature.melspectrogram(
-        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length
-    )
-    # ref=np.max makes the dB scale relative to the loudest bin of this spectrogram.
-    spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)
-    return spectrogram_db
-def save_spectrograms(
-    ys: tp.List[np.ndarray],
-    sr: int,
-    path: str,
-    names: tp.List[str],
-    n_fft: int = 4096,
-    hop_length: int = 128,
-    dur: float = 8.0,
-):
-    """Plot a spectrogram for an audio file.
-    Args:
-        ys: List of audio spectrograms
-        sr (int): Sampling rate of the audio file. Default is 22050 Hz.
-        path (str): Path to the plot file.
-        names: name of each spectrogram plot
-        n_fft (int): Number of samples per FFT. Default is 2048.
-        hop_length (int): Number of samples between successive frames. Default is 512.
-        dur (float): Maxium duration to plot the spectrograms
-    Returns:
-        None (plots the spectrogram using matplotlib)
-    """
-    import matplotlib as mpl  # type: ignore
-    import matplotlib.pyplot as plt  # type: ignore
-    import librosa.display
-    if not names:
-        names = ["Ground Truth", "Audio Watermarked", "Watermark"]
-    ys = [wav[: int(dur * sr)] for wav in ys]  # crop
-    assert len(names) == len(
-        ys
-    ), f"There are {len(ys)} wavs but {len(names)} names ({names})"
-    # Set matplotlib stuff
-    BIGGER_SIZE = 10
-    SMALLER_SIZE = 8
-    linewidth = 234.8775  # linewidth in pt
-    plt.rc("font", size=BIGGER_SIZE, family="serif")  # controls default text sizes
-    plt.rcParams["font.family"] = "DeJavu Serif"
-    plt.rcParams["font.serif"] = ["Times New Roman"]
-    plt.rc("axes", titlesize=BIGGER_SIZE)  # fontsize of the axes title
-    plt.rc("axes", labelsize=BIGGER_SIZE)  # fontsize of the x and y labels
-    plt.rc("xtick", labelsize=BIGGER_SIZE)  # fontsize of the tick labels
-    plt.rc("ytick", labelsize=SMALLER_SIZE)  # fontsize of the tick labels
-    plt.rc("legend", fontsize=BIGGER_SIZE)  # legend fontsize
-    plt.rc("figure", titlesize=BIGGER_SIZE)
-    height = 1.6 * linewidth / 72.0
-    fig, ax = plt.subplots(
-        nrows=len(ys),
-        ncols=1,
-        sharex=True,
-        figsize=(linewidth / 72.0, height),
-    )
-    fig.tight_layout()
-    # Plot the spectrogram
-    for i, ysi in enumerate(ys):
-        spectrogram_db = get_spec(ysi, sr=sr, n_fft=n_fft, hop_length=hop_length)
-        if i == 0:
-            cax = fig.add_axes(
-                [
-                    ax[0].get_position().x1 + 0.01,  # type: ignore
-                    ax[-1].get_position().y0,
-                    0.02,
-                    ax[0].get_position().y1 - ax[-1].get_position().y0,
-                ]
-            )
-            fig.colorbar(
-                mpl.cm.ScalarMappable(
-                    norm=mpl.colors.Normalize(
-                        np.min(spectrogram_db), np.max(spectrogram_db)
-                    ),
-                    cmap="magma",
-                ),
-                ax=ax,
-                orientation="vertical",
-                format="%+2.0f dB",
-                cax=cax,
-            )
-        librosa.display.specshow(
-            spectrogram_db,
-            sr=sr,
-            hop_length=hop_length,
-            x_axis="time",
-            y_axis="mel",
-            ax=ax[i],
-        )
-        ax[i].set(title=names[i])
-        ax[i].yaxis.set_label_text(None)
-        ax[i].label_outer()
-    fig.savefig(path, bbox_inches="tight")
-    plt.close()

audiocraft/data/audio_dataset.py DELETED Viewed

@@ -1,587 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""AudioDataset support. In order to handle a larger number of files
-without having to scan again the folders, we precompute some metadata
-(filename, sample rate, duration), and use that to efficiently sample audio segments.
-"""
-import argparse
-import copy
-from concurrent.futures import ThreadPoolExecutor, Future
-from dataclasses import dataclass, fields
-from contextlib import ExitStack
-from functools import lru_cache
-import gzip
-import json
-import logging
-import os
-from pathlib import Path
-import random
-import sys
-import typing as tp
-import torch
-import torch.nn.functional as F
-from .audio import audio_read, audio_info
-from .audio_utils import convert_audio
-from .zip import PathInZip
-try:
-    import dora
-except ImportError:
-    dora = None  # type: ignore
-@dataclass(order=True)
-class BaseInfo:
-    @classmethod
-    def _dict2fields(cls, dictionary: dict):
-        return {
-            field.name: dictionary[field.name]
-            for field in fields(cls) if field.name in dictionary
-        }
-    @classmethod
-    def from_dict(cls, dictionary: dict):
-        _dictionary = cls._dict2fields(dictionary)
-        return cls(**_dictionary)
-    def to_dict(self):
-        return {
-            field.name: self.__getattribute__(field.name)
-            for field in fields(self)
-            }
-@dataclass(order=True)
-class AudioMeta(BaseInfo):
-    path: str
-    duration: float
-    sample_rate: int
-    amplitude: tp.Optional[float] = None
-    weight: tp.Optional[float] = None
-    # info_path is used to load additional information about the audio file that is stored in zip files.
-    info_path: tp.Optional[PathInZip] = None
-    @classmethod
-    def from_dict(cls, dictionary: dict):
-        base = cls._dict2fields(dictionary)
-        if 'info_path' in base and base['info_path'] is not None:
-            base['info_path'] = PathInZip(base['info_path'])
-        return cls(**base)
-    def to_dict(self):
-        d = super().to_dict()
-        if d['info_path'] is not None:
-            d['info_path'] = str(d['info_path'])
-        return d
-@dataclass(order=True)
-class SegmentInfo(BaseInfo):
-    """Description of one audio segment: its source file metadata and where it was taken from."""
-    # Metadata of the audio file the segment comes from.
-    meta: AudioMeta
-    # Position in the source file where the segment starts.
-    seek_time: float
-    # The following values are given once the audio is processed, e.g.
-    # at the target sample rate and target number of channels.
-    n_frames: int      # actual number of frames without padding
-    total_frames: int  # total number of frames, padding included
-    sample_rate: int   # actual sample rate
-    channels: int      # number of audio channels.
-# File suffixes (lower-case, with the leading dot) that `find_audio_files` accepts by default.
-DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
-# Module-level logger named after this module.
-logger = logging.getLogger(__name__)
-def _get_audio_meta(file_path: str, minimal: bool = True) -> AudioMeta:
-    """AudioMeta from a path to an audio file.
-    Args:
-        file_path (str): Resolved path of valid audio file.
-        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
-    Returns:
-        AudioMeta: Audio file path and its metadata.
-    """
-    info = audio_info(file_path)
-    amplitude: tp.Optional[float] = None
-    if not minimal:
-        wav, sr = audio_read(file_path)
-        amplitude = wav.abs().max().item()
-    return AudioMeta(file_path, info.duration, info.sample_rate, amplitude)
-def _resolve_audio_meta(m: AudioMeta, fast: bool = True) -> AudioMeta:
-    """If Dora is available as a dependency, try to resolve potential relative paths
-    in list of AudioMeta. This method is expected to be used when loading meta from file.
-    Args:
-        m (AudioMeta): Audio meta to resolve.
-        fast (bool): If True, uses a really fast check for determining if a file
-            is already absolute or not. Only valid on Linux/Mac.
-    Returns:
-        AudioMeta: Audio meta with resolved path.
-    """
-    def is_abs(m):
-        if fast:
-            return str(m)[0] == '/'
-        else:
-            os.path.isabs(str(m))
-    if not dora:
-        return m
-    if not is_abs(m.path):
-        m.path = dora.git_save.to_absolute_path(m.path)
-    if m.info_path is not None and not is_abs(m.info_path.zip_path):
-        m.info_path.zip_path = dora.git_save.to_absolute_path(m.path)
-    return m
-def find_audio_files(path: tp.Union[Path, str],
-                     exts: tp.List[str] = DEFAULT_EXTS,
-                     resolve: bool = True,
-                     minimal: bool = True,
-                     progress: bool = False,
-                     workers: int = 0) -> tp.List[AudioMeta]:
-    """Build a list of AudioMeta from a given path,
-    collecting relevant audio files and fetching meta info.
-    Files whose metadata cannot be read are reported on stderr and skipped.
-    Args:
-        path (str or Path): Path to folder containing audio files.
-        exts (list of str): List of file extensions to consider for audio files.
-            The lower-cased file suffix is compared against this list.
-        resolve (bool): Whether to pass each meta through `_resolve_audio_meta`.
-        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
-        progress (bool): Whether to log progress on audio files collection.
-        workers (int): number of parallel workers, if 0, use only the current thread.
-    Returns:
-        list of AudioMeta: Sorted list of audio file path and its metadata.
-    """
-    audio_files = []
-    futures: tp.List[Future] = []
-    pool: tp.Optional[ThreadPoolExecutor] = None
-    # The ExitStack lets the thread pool be entered only when workers are requested.
-    with ExitStack() as stack:
-        if workers > 0:
-            pool = ThreadPoolExecutor(workers)
-            stack.enter_context(pool)
-        if progress:
-            print("Finding audio files...")
-        for root, folders, files in os.walk(path, followlinks=True):
-            for file in files:
-                full_path = Path(root) / file
-                if full_path.suffix.lower() in exts:
-                    audio_files.append(full_path)
-                    # futures is appended in lockstep with audio_files, so both share indices.
-                    if pool is not None:
-                        futures.append(pool.submit(_get_audio_meta, str(audio_files[-1]), minimal))
-                    if progress:
-                        print(format(len(audio_files), " 8d"), end='\r', file=sys.stderr)
-        if progress:
-            print("Getting audio metadata...")
-        meta: tp.List[AudioMeta] = []
-        for idx, file_path in enumerate(audio_files):
-            try:
-                if pool is None:
-                    m = _get_audio_meta(str(file_path), minimal)
-                else:
-                    m = futures[idx].result()
-                if resolve:
-                    m = _resolve_audio_meta(m)
-            except Exception as err:
-                # Best effort: a file that fails is logged and left out of the result.
-                print("Error with", str(file_path), err, file=sys.stderr)
-                continue
-            meta.append(m)
-            if progress:
-                print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
-    meta.sort()
-    return meta
-def load_audio_meta(path: tp.Union[str, Path],
-                    resolve: bool = True, fast: bool = True) -> tp.List[AudioMeta]:
-    """Load list of AudioMeta from an optionally compressed json file.
-    Args:
-        path (str or Path): Path to JSON file.
-        resolve (bool): Whether to resolve the path from AudioMeta (default=True).
-        fast (bool): activates some tricks to make things faster.
-    Returns:
-        list of AudioMeta: List of audio file path and its total duration.
-    """
-    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
-    with open_fn(path, 'rb') as fp:  # type: ignore
-        lines = fp.readlines()
-    meta = []
-    for line in lines:
-        d = json.loads(line)
-        m = AudioMeta.from_dict(d)
-        if resolve:
-            m = _resolve_audio_meta(m, fast=fast)
-        meta.append(m)
-    return meta
-def save_audio_meta(path: tp.Union[str, Path], meta: tp.List[AudioMeta]):
-    """Save the audio metadata to the file pointer as json.
-    Args:
-        path (str or Path): Path to JSON file.
-        metadata (list of BaseAudioMeta): List of audio meta to save.
-    """
-    Path(path).parent.mkdir(exist_ok=True, parents=True)
-    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
-    with open_fn(path, 'wb') as fp:  # type: ignore
-        for m in meta:
-            json_str = json.dumps(m.to_dict()) + '\n'
-            json_bytes = json_str.encode('utf-8')
-            fp.write(json_bytes)
-class AudioDataset:
-    """Base audio dataset.
-    The dataset takes a list of AudioMeta and create a dataset composed of segments of audio
-    and potentially additional information, by creating random segments from the list of audio
-    files referenced in the metadata and applying minimal data pre-processing such as resampling,
-    mixing of channels, padding, etc.
-    If no segment_duration value is provided, the AudioDataset will return the full wav for each
-    audio file. Otherwise, it will randomly sample audio files and create a segment of the specified
-    duration, applying padding if required.
-    By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True
-    allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
-    original audio meta.
-    Note that you can call `start_epoch(epoch)` in order to get
-    a deterministic "randomization" for `shuffle=True`.
-    For a given epoch and dataset index, this will always return the same extract.
-    You can get back some diversity by setting the `shuffle_seed` param.
-    Args:
-        meta (list of AudioMeta): List of audio files metadata.
-        segment_duration (float, optional): Optional segment duration of audio to load.
-            If not specified, the dataset will load the full audio segment from the file.
-        shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
-        sample_rate (int): Target sample rate of the loaded audio samples.
-        channels (int): Target number of channels of the loaded audio samples.
-        sample_on_duration (bool): Set to `True` to sample segments with probability
-            dependent on audio file duration. This is only used if `segment_duration` is provided.
-        sample_on_weight (bool): Set to `True` to sample segments using the `weight` entry of
-            `AudioMeta`. If `sample_on_duration` is also True, the actual weight will be the product
-            of the file duration and file weight. This is only used if `segment_duration` is provided.
-        min_segment_ratio (float): Minimum segment ratio to use when the audio file
-            is shorter than the desired segment.
-        max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
-        return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
-        min_audio_duration (float, optional): Minimum audio file duration, in seconds, if provided
-            audio shorter than this will be filtered out.
-        max_audio_duration (float, optional): Maximal audio file duration in seconds, if provided
-            audio longer than this will be filtered out.
-        shuffle_seed (int): can be used to further randomize
-        load_wav (bool): if False, skip loading the wav but returns a tensor of 0
-            with the expected segment_duration (which must be provided if load_wav is False).
-        permutation_on_files (bool): only if `sample_on_weight` and `sample_on_duration`
-            are False. Will ensure a permutation on files when going through the dataset.
-            In that case the epoch number must be provided in order for the model
-            to continue the permutation across epochs. In that case, it is assumed
-            that `num_samples = total_batch_size * num_updates_per_epoch`, with
-            `total_batch_size` the overall batch size accounting for all gpus.
-    """
-    def __init__(self,
-                 meta: tp.List[AudioMeta],
-                 segment_duration: tp.Optional[float] = None,
-                 shuffle: bool = True,
-                 num_samples: int = 10_000,
-                 sample_rate: int = 48_000,
-                 channels: int = 2,
-                 pad: bool = True,
-                 sample_on_duration: bool = True,
-                 sample_on_weight: bool = True,
-                 min_segment_ratio: float = 0.5,
-                 max_read_retry: int = 10,
-                 return_info: bool = False,
-                 min_audio_duration: tp.Optional[float] = None,
-                 max_audio_duration: tp.Optional[float] = None,
-                 shuffle_seed: int = 0,
-                 load_wav: bool = True,
-                 permutation_on_files: bool = False,
-                 ):
-        assert len(meta) > 0, "No audio meta provided to AudioDataset. Please check loading of audio meta."
-        assert segment_duration is None or segment_duration > 0
-        assert segment_duration is None or min_segment_ratio >= 0
-        self.segment_duration = segment_duration
-        self.min_segment_ratio = min_segment_ratio
-        self.max_audio_duration = max_audio_duration
-        self.min_audio_duration = min_audio_duration
-        if self.min_audio_duration is not None and self.max_audio_duration is not None:
-            assert self.min_audio_duration <= self.max_audio_duration
-        self.meta: tp.List[AudioMeta] = self._filter_duration(meta)
-        assert len(self.meta)  # Fail fast if all data has been filtered.
-        self.total_duration = sum(d.duration for d in self.meta)
-        if segment_duration is None:
-            num_samples = len(self.meta)
-        self.num_samples = num_samples
-        self.shuffle = shuffle
-        self.sample_rate = sample_rate
-        self.channels = channels
-        self.pad = pad
-        self.sample_on_weight = sample_on_weight
-        self.sample_on_duration = sample_on_duration
-        self.sampling_probabilities = self._get_sampling_probabilities()
-        self.max_read_retry = max_read_retry
-        self.return_info = return_info
-        self.shuffle_seed = shuffle_seed
-        self.current_epoch: tp.Optional[int] = None
-        self.load_wav = load_wav
-        if not load_wav:
-            assert segment_duration is not None
-        self.permutation_on_files = permutation_on_files
-        if permutation_on_files:
-            assert not self.sample_on_duration
-            assert not self.sample_on_weight
-            assert self.shuffle
-    def start_epoch(self, epoch: int):
-        self.current_epoch = epoch
-    def __len__(self):
-        return self.num_samples
-    def _get_sampling_probabilities(self, normalized: bool = True):
-        """Return the sampling probabilities for each file inside `self.meta`."""
-        scores: tp.List[float] = []
-        for file_meta in self.meta:
-            score = 1.
-            if self.sample_on_weight and file_meta.weight is not None:
-                score *= file_meta.weight
-            if self.sample_on_duration:
-                score *= file_meta.duration
-            scores.append(score)
-        probabilities = torch.tensor(scores)
-        if normalized:
-            probabilities /= probabilities.sum()
-        return probabilities
-    @staticmethod
-    @lru_cache(16)
-    def _get_file_permutation(num_files: int, permutation_index: int, base_seed: int):
-        # Used to keep the most recent files permutation in memory implicitely.
-        # will work unless someone is using a lot of Datasets in parallel.
-        rng = torch.Generator()
-        rng.manual_seed(base_seed + permutation_index)
-        return torch.randperm(num_files, generator=rng)
-    def sample_file(self, index: int, rng: torch.Generator) -> AudioMeta:
-        """Sample a given file from `self.meta`. Can be overridden in subclasses.
-        This is only called if `segment_duration` is not None.
-        You must use the provided random number generator `rng` for reproducibility.
-        You can further make use of the index accessed.
-        """
-        if self.permutation_on_files:
-            assert self.current_epoch is not None
-            total_index = self.current_epoch * len(self) + index
-            permutation_index = total_index // len(self.meta)
-            relative_index = total_index % len(self.meta)
-            permutation = AudioDataset._get_file_permutation(
-                len(self.meta), permutation_index, self.shuffle_seed)
-            file_index = permutation[relative_index]
-            return self.meta[file_index]
-        if not self.sample_on_weight and not self.sample_on_duration:
-            file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
-        else:
-            file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())
-        return self.meta[file_index]
-    def _audio_read(self, path: str, seek_time: float = 0, duration: float = -1):
-        # Override this method in subclass if needed.
-        if self.load_wav:
-            return audio_read(path, seek_time, duration, pad=False)
-        else:
-            assert self.segment_duration is not None
-            n_frames = int(self.sample_rate * self.segment_duration)
-            return torch.zeros(self.channels, n_frames), self.sample_rate
-    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
-        if self.segment_duration is None:
-            file_meta = self.meta[index]
-            out, sr = audio_read(file_meta.path)
-            out = convert_audio(out, sr, self.sample_rate, self.channels)
-            n_frames = out.shape[-1]
-            segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
-                                       sample_rate=self.sample_rate, channels=out.shape[0])
-        else:
-            rng = torch.Generator()
-            if self.shuffle:
-                # We use index, plus extra randomness, either totally random if we don't know the epoch.
-                # otherwise we make use of the epoch number and optional shuffle_seed.
-                if self.current_epoch is None:
-                    rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
-                else:
-                    rng.manual_seed(index + self.num_samples * (self.current_epoch + self.shuffle_seed))
-            else:
-                # We only use index
-                rng.manual_seed(index)
-            for retry in range(self.max_read_retry):
-                file_meta = self.sample_file(index, rng)
-                # We add some variance in the file position even if audio file is smaller than segment
-                # without ending up with empty segments
-                max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
-                seek_time = torch.rand(1, generator=rng).item() * max_seek
-                try:
-                    out, sr = audio_read(file_meta.path, seek_time, self.segment_duration, pad=False)
-                    out = convert_audio(out, sr, self.sample_rate, self.channels)
-                    n_frames = out.shape[-1]
-                    target_frames = int(self.segment_duration * self.sample_rate)
-                    if self.pad:
-                        out = F.pad(out, (0, target_frames - n_frames))
-                    segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
-                                               sample_rate=self.sample_rate, channels=out.shape[0])
-                except Exception as exc:
-                    logger.warning("Error opening file %s: %r", file_meta.path, exc)
-                    if retry == self.max_read_retry - 1:
-                        raise
-                else:
-                    break
-        if self.return_info:
-            # Returns the wav and additional information on the wave segment
-            return out, segment_info
-        else:
-            return out
-    def collater(self, samples):
-        """The collater function has to be provided to the dataloader
-        if AudioDataset has return_info=True in order to properly collate
-        the samples of a batch.
-        """
-        if self.segment_duration is None and len(samples) > 1:
-            assert self.pad, "Must allow padding when batching examples of different durations."
-        # In this case the audio reaching the collater is of variable length as segment_duration=None.
-        to_pad = self.segment_duration is None and self.pad
-        if to_pad:
-            max_len = max([wav.shape[-1] for wav, _ in samples])
-            def _pad_wav(wav):
-                return F.pad(wav, (0, max_len - wav.shape[-1]))
-        if self.return_info:
-            if len(samples) > 0:
-                assert len(samples[0]) == 2
-                assert isinstance(samples[0][0], torch.Tensor)
-                assert isinstance(samples[0][1], SegmentInfo)
-            wavs = [wav for wav, _ in samples]
-            segment_infos = [copy.deepcopy(info) for _, info in samples]
-            if to_pad:
-                # Each wav could be of a different duration as they are not segmented.
-                for i in range(len(samples)):
-                    # Determines the total length of the signal with padding, so we update here as we pad.
-                    segment_infos[i].total_frames = max_len
-                    wavs[i] = _pad_wav(wavs[i])
-            wav = torch.stack(wavs)
-            return wav, segment_infos
-        else:
-            assert isinstance(samples[0], torch.Tensor)
-            if to_pad:
-                samples = [_pad_wav(s) for s in samples]
-            return torch.stack(samples)
-    def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
-        """Filters out audio files with audio durations that will not allow to sample examples from them."""
-        orig_len = len(meta)
-        # Filter data that is too short.
-        if self.min_audio_duration is not None:
-            meta = [m for m in meta if m.duration >= self.min_audio_duration]
-        # Filter data that is too long.
-        if self.max_audio_duration is not None:
-            meta = [m for m in meta if m.duration <= self.max_audio_duration]
-        filtered_len = len(meta)
-        removed_percentage = 100*(1-float(filtered_len)/orig_len)
-        msg = 'Removed %.2f percent of the data because it was too short or too long.' % removed_percentage
-        if removed_percentage < 10:
-            logging.debug(msg)
-        else:
-            logging.warning(msg)
-        return meta
-    @classmethod
-    def from_meta(cls, root: tp.Union[str, Path], **kwargs):
-        """Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.
-        Args:
-            root (str or Path): Path to root folder containing audio files.
-            kwargs: Additional keyword arguments for the AudioDataset.
-        """
-        root = Path(root)
-        if root.is_dir():
-            if (root / 'data.jsonl').exists():
-                root = root / 'data.jsonl'
-            elif (root / 'data.jsonl.gz').exists():
-                root = root / 'data.jsonl.gz'
-            else:
-                raise ValueError("Don't know where to read metadata from in the dir. "
-                                 "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
-        meta = load_audio_meta(root)
-        return cls(meta, **kwargs)
-    @classmethod
-    def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
-                  exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
-        """Instantiate AudioDataset from a path containing (possibly nested) audio files.
-        Args:
-            root (str or Path): Path to root folder containing audio files.
-            minimal_meta (bool): Whether to only load minimal metadata or not.
-            exts (list of str): Extensions for audio files.
-            kwargs: Additional keyword arguments for the AudioDataset.
-        """
-        root = Path(root)
-        if root.is_file():
-            meta = load_audio_meta(root, resolve=True)
-        else:
-            meta = find_audio_files(root, exts, minimal=minimal_meta, resolve=True)
-        return cls(meta, **kwargs)
-def main():
-    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
-    parser = argparse.ArgumentParser(
-        prog='audio_dataset',
-        description='Generate .jsonl files by scanning a folder.')
-    parser.add_argument('root', help='Root folder with all the audio files')
-    parser.add_argument('output_meta_file',
-                        help='Output file to store the metadata, ')
-    parser.add_argument('--complete',
-                        action='store_false', dest='minimal', default=True,
-                        help='Retrieve all metadata, even the one that are expansive '
-                             'to compute (e.g. normalization).')
-    parser.add_argument('--resolve',
-                        action='store_true', default=False,
-                        help='Resolve the paths to be absolute and with no symlinks.')
-    parser.add_argument('--workers',
-                        default=10, type=int,
-                        help='Number of workers.')
-    args = parser.parse_args()
-    meta = find_audio_files(args.root, DEFAULT_EXTS, progress=True,
-                            resolve=args.resolve, minimal=args.minimal, workers=args.workers)
-    save_audio_meta(args.output_meta_file, meta)
-if __name__ == '__main__':
-    main()

audiocraft/data/audio_utils.py DELETED Viewed

@@ -1,296 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Various utilities for audio convertion (pcm format, sample rate and channels),
-and volume normalization."""
-import sys
-import typing as tp
-import julius
-import torch
-import torchaudio
-def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
-    """Convert audio to the given number of channels.
-    Args:
-        wav (torch.Tensor): Audio wave of shape [B, C, T].
-        channels (int): Expected number of channels as output.
-    Returns:
-        torch.Tensor: Downmixed or unchanged audio wave [B, C, T].
-    """
-    *shape, src_channels, length = wav.shape
-    if src_channels == channels:
-        pass
-    elif channels == 1:
-        # Case 1:
-        # The caller asked 1-channel audio, and the stream has multiple
-        # channels, downmix all channels.
-        wav = wav.mean(dim=-2, keepdim=True)
-    elif src_channels == 1:
-        # Case 2:
-        # The caller asked for multiple channels, but the input file has
-        # a single channel, replicate the audio over all channels.
-        wav = wav.expand(*shape, channels, length)
-    elif src_channels >= channels:
-        # Case 3:
-        # The caller asked for multiple channels, and the input file has
-        # more channels than requested. In that case return the first channels.
-        wav = wav[..., :channels, :]
-    else:
-        # Case 4: What is a reasonable choice here?
-        raise ValueError('The audio file has less channels than requested but is not mono.')
-    return wav
-def convert_audio(wav: torch.Tensor, from_rate: float,
-                  to_rate: float, to_channels: int) -> torch.Tensor:
-    """Convert audio to new sample rate and number of audio channels.
-    """
-    wav = julius.resample_frac(wav, int(from_rate), int(to_rate))
-    wav = convert_audio_channels(wav, to_channels)
-    return wav
-def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
-                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
-    """Normalize an input signal to a user loudness in dB LKFS.
-    Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
-    Args:
-        wav (torch.Tensor): Input multichannel audio data.
-        sample_rate (int): Sample rate.
-        loudness_headroom_db (float): Target loudness of the output in dB LUFS.
-        loudness_compressor (bool): Uses tanh for soft clipping.
-        energy_floor (float): anything below that RMS level will not be rescaled.
-    Returns:
-        output (torch.Tensor): Loudness normalized output data.
-    """
-    energy = wav.pow(2).mean().sqrt().item()
-    if energy < energy_floor:
-        return wav
-    transform = torchaudio.transforms.Loudness(sample_rate)
-    input_loudness_db = transform(wav).item()
-    # calculate the gain needed to scale to the desired loudness level
-    delta_loudness = -loudness_headroom_db - input_loudness_db
-    gain = 10.0 ** (delta_loudness / 20.0)
-    output = gain * wav
-    if loudness_compressor:
-        output = torch.tanh(output)
-    assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
-    return output
-def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None) -> None:
-    """Utility function to clip the audio with logging if specified."""
-    max_scale = wav.abs().max()
-    if log_clipping and max_scale > 1:
-        clamp_prob = (wav.abs() > 1).float().mean().item()
-        print(f"CLIPPING {stem_name or ''} happening with proba (a bit of clipping is okay):",
-              clamp_prob, "maximum scale: ", max_scale.item(), file=sys.stderr)
-    wav.clamp_(-1, 1)
-def normalize_audio(wav: torch.Tensor, normalize: bool = True,
-                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                    loudness_compressor: bool = False, log_clipping: bool = False,
-                    sample_rate: tp.Optional[int] = None,
-                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
-    """Normalize the audio according to the prescribed strategy (see after).
-    Args:
-        wav (torch.Tensor): Audio data.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): If True, uses tanh based soft clipping.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms').
-        sample_rate (int): Sample rate for the audio data (required for loudness).
-        stem_name (Optional[str]): Stem name for clipping logging.
-    Returns:
-        torch.Tensor: Normalized audio.
-    """
-    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
-    scale_rms = 10 ** (-rms_headroom_db / 20)
-    if strategy == 'peak':
-        rescaling = (scale_peak / wav.abs().max())
-        if normalize or rescaling < 1:
-            wav = wav * rescaling
-    elif strategy == 'clip':
-        wav = wav.clamp(-scale_peak, scale_peak)
-    elif strategy == 'rms':
-        mono = wav.mean(dim=0)
-        rescaling = scale_rms / mono.pow(2).mean().sqrt()
-        if normalize or rescaling < 1:
-            wav = wav * rescaling
-        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
-    elif strategy == 'loudness':
-        assert sample_rate is not None, "Loudness normalization requires sample rate."
-        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
-        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
-    else:
-        assert wav.abs().max() < 1
-        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
-    return wav
-def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
-    """Convert audio to float 32 bits PCM format.
-    """
-    if wav.dtype.is_floating_point:
-        return wav
-    elif wav.dtype == torch.int16:
-        return wav.float() / 2**15
-    elif wav.dtype == torch.int32:
-        return wav.float() / 2**31
-    raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
-def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
-    """Convert audio to int 16 bits PCM format.
-    ..Warning:: There exist many formula for doing this conversion. None are perfect
-    due to the asymmetry of the int16 range. One either have possible clipping, DC offset,
-    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
-    it is possible that `i16_pcm(f32_pcm)) != Identity`.
-    """
-    if wav.dtype.is_floating_point:
-        assert wav.abs().max() <= 1
-        candidate = (wav * 2 ** 15).round()
-        if candidate.max() >= 2 ** 15:  # clipping would occur
-            candidate = (wav * (2 ** 15 - 1)).round()
-        return candidate.short()
-    else:
-        assert wav.dtype == torch.int16
-        return wav
-def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear", stem_name: tp.Optional[str] = None) -> torch.Tensor:
-    """
-    Apply fade-in and/or fade-out effects to the audio tensor.
-    Args:
-        audio (torch.Tensor): The input audio tensor of shape (C, L).
-        sample_rate (int): The sample rate of the audio.
-        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
-        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
-        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
-        shape (str, optional): The shape of the fade. Must be one of: "quarter_sine", "half_sine", "linear", "logarithmic", "exponential". Defaults to "linear".
-    Returns:
-        torch.Tensor: The audio tensor with the fade effect applied.
-    """
-    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
-    # Create the fade transform
-    fade_transform = torchaudio.transforms.Fade(fade_in_len=0, fade_out_len=0, fade_shape=shape)
-    if out:
-        fade_transform.fade_out_len = fade_samples
-    else:
-        fade_transform.fade_in_len = fade_samples
-    # Select the portion of the audio to apply the fade
-    if start:
-        audio_fade_section = audio[:, :fade_samples]
-    else:
-        audio_fade_section = audio[:, -fade_samples:]
-    # Apply the fade transform to the audio section
-    audio_faded = fade_transform(audio)
-    # Replace the selected portion of the audio with the faded section
-    if start:
-        audio_faded[:, :fade_samples] = audio_fade_section
-    else:
-        audio_faded[:, -fade_samples:] = audio_fade_section
-    wav = normalize_loudness(audio_faded,sample_rate, loudness_headroom_db=18, loudness_compressor=True)
-    _clip_wav(wav, log_clipping=False, stem_name=stem_name)
-    return wav
-def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu", stem_name: tp.Optional[str] = None) -> torch.Tensor:
-    """
-    Apply fade-in and/or fade-out effects to the audio tensor.
-    Args:
-        audio (torch.Tensor): The input audio tensor of shape (C, L).
-        sample_rate (int): The sample rate of the audio.
-        duration (float, optional): The duration of the fade in seconds. Defaults to 3.0.
-        out (bool, optional): Determines whether to apply fade-in (False) or fade-out (True) effect. Defaults to True.
-        start (bool, optional): Determines whether the fade is applied to the beginning (True) or end (False) of the audio. Defaults to True.
-        curve_start (float, optional): The starting amplitude of the fade curve. Defaults to 0.0.
-        curve_end (float, optional): The ending amplitude of the fade curve. Defaults to 1.0.
-        current_device (str, optional): The device on which the fade curve tensor should be created. Defaults to "cpu".
-    Returns:
-        torch.Tensor: The audio tensor with the fade effect applied.
-    """
-    fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
-    fade_curve = torch.linspace(curve_start, curve_end, fade_samples, device=current_device)  # Generate linear fade curve
-    if out:
-        fade_curve = fade_curve.flip(0)  # Reverse the fade curve for fade out
-    # Select the portion of the audio to apply the fade
-    if start:
-        audio_fade_section = audio[:, :fade_samples]
-    else:
-        audio_fade_section = audio[:, -fade_samples:]
-    # Apply the fade curve to the audio section
-    audio_faded = audio.clone()
-    audio_faded[:, :fade_samples] *= fade_curve.unsqueeze(0)
-    audio_faded[:, -fade_samples:] *= fade_curve.unsqueeze(0)
-    # Replace the selected portion of the audio with the faded section
-    if start:
-        audio_faded[:, :fade_samples] = audio_fade_section
-    else:
-        audio_faded[:, -fade_samples:] = audio_fade_section
-    wav = normalize_loudness(audio_faded,sample_rate, loudness_headroom_db=18, loudness_compressor=True)
-    _clip_wav(wav, log_clipping=False, stem_name=stem_name)
-    return wav
-def apply_splice_effect(waveform1, sample_rate1, waveform2, sample_rate2, overlap):
-    # Convert sample rates to integers
-    sample_rate1 = int(sample_rate1)
-    sample_rate2 = int(sample_rate2)
-    # Convert tensors to mono-channel if needed
-    if waveform1.ndim > 2:
-        waveform1 = waveform1.mean(dim=1)
-    if waveform2.ndim > 2:
-        waveform2 = waveform2.mean(dim=1)
-    ## Convert tensors to numpy arrays
-    #waveform1_np = waveform1.numpy()
-    #waveform2_np = waveform2.numpy()
-    # Apply splice effect using torchaudio.sox_effects.apply_effects_tensor
-    effects = [
-        ["splice", f"-q {waveform1},{overlap}"],
-    ]
-    output_waveform, output_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
-        torch.cat([waveform1.unsqueeze(0), waveform2.unsqueeze(0)], dim=2),
-        sample_rate1,
-        effects
-    )
-    return output_waveform.squeeze(0), output_sample_rate

audiocraft/data/info_audio_dataset.py DELETED Viewed

@@ -1,110 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Base classes for the datasets that also provide non-audio metadata,
-e.g. description, text transcription etc.
-"""
-from dataclasses import dataclass
-import logging
-import math
-import re
-import typing as tp
-import torch
-from .audio_dataset import AudioDataset, AudioMeta
-from ..environment import AudioCraftEnvironment
-from ..modules.conditioners import SegmentWithAttributes, ConditioningAttributes
-logger = logging.getLogger(__name__)
-def _clusterify_meta(meta: AudioMeta) -> AudioMeta:
-    """Monkey-patch meta to match cluster specificities."""
-    meta.path = AudioCraftEnvironment.apply_dataset_mappers(meta.path)
-    if meta.info_path is not None:
-        meta.info_path.zip_path = AudioCraftEnvironment.apply_dataset_mappers(meta.info_path.zip_path)
-    return meta
-def clusterify_all_meta(meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
-    """Monkey-patch all meta to match cluster specificities."""
-    return [_clusterify_meta(m) for m in meta]
-@dataclass
-class AudioInfo(SegmentWithAttributes):
-    """Dummy SegmentInfo with empty attributes.
-    The InfoAudioDataset is expected to return metadata that inherits
-    from SegmentWithAttributes class and can return conditioning attributes.
-    This basically guarantees all datasets will be compatible with current
-    solver that contain conditioners requiring this.
-    """
-    audio_tokens: tp.Optional[torch.Tensor] = None  # populated when using cached batch for training a LM.
-    def to_condition_attributes(self) -> ConditioningAttributes:
-        return ConditioningAttributes()
-class InfoAudioDataset(AudioDataset):
-    """AudioDataset that always returns metadata as SegmentWithAttributes along with the audio waveform.
-    See `audiocraft.data.audio_dataset.AudioDataset` for initialization arguments.
-    """
-    def __init__(self, meta: tp.List[AudioMeta], **kwargs):
-        super().__init__(clusterify_all_meta(meta), **kwargs)
-    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentWithAttributes]]:
-        if not self.return_info:
-            wav = super().__getitem__(index)
-            assert isinstance(wav, torch.Tensor)
-            return wav
-        wav, meta = super().__getitem__(index)
-        return wav, AudioInfo(**meta.to_dict())
-def get_keyword_or_keyword_list(value: tp.Optional[str]) -> tp.Union[tp.Optional[str], tp.Optional[tp.List[str]]]:
-    """Preprocess a single keyword or possible a list of keywords."""
-    if isinstance(value, list):
-        return get_keyword_list(value)
-    else:
-        return get_keyword(value)
-def get_string(value: tp.Optional[str]) -> tp.Optional[str]:
-    """Preprocess a single keyword."""
-    if value is None or (not isinstance(value, str)) or len(value) == 0 or value == 'None':
-        return None
-    else:
-        return value.strip()
-def get_keyword(value: tp.Optional[str]) -> tp.Optional[str]:
-    """Preprocess a single keyword."""
-    if value is None or (not isinstance(value, str)) or len(value) == 0 or value == 'None':
-        return None
-    else:
-        return value.strip().lower()
-def get_keyword_list(values: tp.Union[str, tp.List[str]]) -> tp.Optional[tp.List[str]]:
-    """Preprocess a list of keywords."""
-    if isinstance(values, str):
-        values = [v.strip() for v in re.split(r'[,\s]', values)]
-    elif isinstance(values, float) and math.isnan(values):
-        values = []
-    if not isinstance(values, list):
-        logger.debug(f"Unexpected keyword list {values}")
-        values = [str(values)]
-    kws = [get_keyword(v) for v in values]
-    kw_list = [k for k in kws if k is not None]
-    if len(kw_list) == 0:
-        return None
-    else:
-        return kw_list

audiocraft/data/zip.py DELETED Viewed

@@ -1,76 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Utility for reading some info from inside a zip file.
-"""
-import typing
-import zipfile
-from dataclasses import dataclass
-from functools import lru_cache
-from typing_extensions import Literal
-DEFAULT_SIZE = 32
-MODE = Literal['r', 'w', 'x', 'a']
-@dataclass(order=True)
-class PathInZip:
-    """Hold a path of file within a zip file.
-    Args:
-        path (str): The convention is <path_to_zip>:<relative_path_inside_zip>.
-            Let's assume there is a zip file /some/location/foo.zip
-            and inside of it is a json file located at /data/file1.json,
-            Then we expect path = "/some/location/foo.zip:/data/file1.json".
-    """
-    INFO_PATH_SEP = ':'
-    zip_path: str
-    file_path: str
-    def __init__(self, path: str) -> None:
-        split_path = path.split(self.INFO_PATH_SEP)
-        assert len(split_path) == 2
-        self.zip_path, self.file_path = split_path
-    @classmethod
-    def from_paths(cls, zip_path: str, file_path: str):
-        return cls(zip_path + cls.INFO_PATH_SEP + file_path)
-    def __str__(self) -> str:
-        return self.zip_path + self.INFO_PATH_SEP + self.file_path
-def _open_zip(path: str, mode: MODE = 'r'):
-    return zipfile.ZipFile(path, mode)
-_cached_open_zip = lru_cache(DEFAULT_SIZE)(_open_zip)
-def set_zip_cache_size(max_size: int):
-    """Sets the maximal LRU caching for zip file opening.
-    Args:
-        max_size (int): the maximal LRU cache.
-    """
-    global _cached_open_zip
-    _cached_open_zip = lru_cache(max_size)(_open_zip)
-def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
-    """Opens a file stored inside a zip and returns a file-like object.
-    Args:
-        path_in_zip (PathInZip): A PathInZip object representing the file to return a file-like object of.
-        mode (str): The mode in which to open the file with.
-    Returns:
-        A file-like object for PathInZip.
-    """
-    zf = _cached_open_zip(path_in_zip.zip_path)
-    return zf.open(path_in_zip.file_path)

audiocraft/environment.py DELETED Viewed

@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Provides cluster and tools configuration across clusters (slurm, dora, utilities).
-"""
-import logging
-import os
-from pathlib import Path
-import re
-import typing as tp
-import omegaconf
-from .utils.cluster import _guess_cluster_type
-logger = logging.getLogger(__name__)
-class AudioCraftEnvironment:
-    """Environment configuration for teams and clusters.
-    AudioCraftEnvironment picks compute cluster settings (slurm, dora) from the current running environment
-    or declared variable and the loaded team configuration. Additionally, the AudioCraftEnvironment
-    provides pointers to a reference folder resolved automatically across clusters that is shared across team members,
-    allowing to share sigs or other files to run jobs. Finally, it provides dataset mappers to automatically
-    map dataset file paths to new locations across clusters, allowing to use the same manifest of files across clusters.
-    The cluster type is identified automatically and the base configuration file is read from
-    config/teams/<team>.yaml.
-    Use the following environment variables to specify the cluster, team or configuration:
-        AUDIOCRAFT_CLUSTER (optional): Cluster type to enforce. Useful if the cluster type
-            cannot be inferred automatically.
-        AUDIOCRAFT_CONFIG (optional): Path to yaml config holding the teams configuration.
-            If not set, configuration is read from config/teams/<team>.yaml.
-        AUDIOCRAFT_TEAM (optional): Name of the team. Recommended to set to your own team.
-            Cluster configuration are shared across teams to match compute allocation,
-            specify your cluster configuration in the configuration file under a key mapping
-            your team name.
-    """
-    # Lazily created singleton, see `instance()` and `reset()`.
-    _instance = None
-    DEFAULT_TEAM = "default"
-    def __init__(self) -> None:
-        """Loads the configuration and compiles the dataset mappers of the selected cluster."""
-        self.team: str = os.getenv("AUDIOCRAFT_TEAM", self.DEFAULT_TEAM)
-        cluster_type = _guess_cluster_type()
-        cluster = os.getenv(
-            "AUDIOCRAFT_CLUSTER", cluster_type.value
-        )
-        logger.info("Detecting cluster type %s", cluster_type)
-        self.cluster: str = cluster
-        config_path = os.getenv(
-            "AUDIOCRAFT_CONFIG",
-            Path(__file__)
-            .parent.parent.joinpath("config/teams", self.team)
-            .with_suffix(".yaml"),
-        )
-        self.config = omegaconf.OmegaConf.load(config_path)
-        # List of (compiled regex, replacement) pairs applied by `apply_dataset_mappers`.
-        self._dataset_mappers = []
-        cluster_config = self._get_cluster_config()
-        if "dataset_mappers" in cluster_config:
-            for pattern, repl in cluster_config["dataset_mappers"].items():
-                regex = re.compile(pattern)
-                self._dataset_mappers.append((regex, repl))
-    def _get_cluster_config(self) -> omegaconf.DictConfig:
-        """Returns the section of the loaded configuration stored under the current cluster name."""
-        assert isinstance(self.config, omegaconf.DictConfig)
-        return self.config[self.cluster]
-    @classmethod
-    def instance(cls):
-        """Returns the shared environment, creating it on first use."""
-        if cls._instance is None:
-            cls._instance = cls()
-        return cls._instance
-    @classmethod
-    def reset(cls):
-        """Clears the environment and forces a reload on next invocation."""
-        cls._instance = None
-    @classmethod
-    def get_team(cls) -> str:
-        """Gets the selected team as dictated by the AUDIOCRAFT_TEAM env var.
-        If not defined, defaults to `DEFAULT_TEAM` ("default").
-        """
-        return cls.instance().team
-    @classmethod
-    def get_cluster(cls) -> str:
-        """Gets the detected cluster.
-        This value can be overridden by the AUDIOCRAFT_CLUSTER env var.
-        """
-        return cls.instance().cluster
-    @classmethod
-    def get_dora_dir(cls) -> Path:
-        """Gets the path to the dora directory for the current team and cluster.
-        Value is overridden by the AUDIOCRAFT_DORA_DIR env var.
-        """
-        cluster_config = cls.instance()._get_cluster_config()
-        dora_dir = os.getenv("AUDIOCRAFT_DORA_DIR", cluster_config["dora_dir"])
-        logger.warning("Dora directory: %s", dora_dir)
-        return Path(dora_dir)
-    @classmethod
-    def get_reference_dir(cls) -> Path:
-        """Gets the path to the reference directory for the current team and cluster.
-        Value is overridden by the AUDIOCRAFT_REFERENCE_DIR env var.
-        """
-        cluster_config = cls.instance()._get_cluster_config()
-        return Path(os.getenv("AUDIOCRAFT_REFERENCE_DIR", cluster_config["reference_dir"]))
-    @classmethod
-    def get_slurm_exclude(cls) -> tp.Optional[str]:
-        """Get the list of nodes to exclude for that cluster."""
-        cluster_config = cls.instance()._get_cluster_config()
-        return cluster_config.get("slurm_exclude")
-    @classmethod
-    def get_slurm_partitions(cls, partition_types: tp.Optional[tp.List[str]] = None) -> str:
-        """Gets the requested partitions for the current team and cluster as a comma-separated string.
-        Args:
-            partition_types (list[str], optional): partition types to retrieve. Values must be
-                from ['global', 'team']. If not provided, the global partition is returned.
-        """
-        if not partition_types:
-            partition_types = ["global"]
-        cluster_config = cls.instance()._get_cluster_config()
-        partitions = [
-            cluster_config["partitions"][partition_type]
-            for partition_type in partition_types
-        ]
-        return ",".join(partitions)
-    @classmethod
-    def resolve_reference_path(cls, path: tp.Union[str, Path]) -> Path:
-        """Converts reference placeholder in path with configured reference dir to resolve paths.
-        Args:
-            path (str or Path): Path to resolve.
-        Returns:
-            Path: Resolved path.
-        """
-        path = str(path)
-        if path.startswith("//reference"):
-            reference_dir = cls.get_reference_dir()
-            # `Logger.warn` is a deprecated alias that no longer exists in Python 3.13.
-            logger.warning("Reference directory: %s", reference_dir)
-            assert (
-                reference_dir.exists() and reference_dir.is_dir()
-            ), f"Reference directory does not exist: {reference_dir}."
-            path = re.sub("^//reference", str(reference_dir), path)
-        return Path(path)
-    @classmethod
-    def apply_dataset_mappers(cls, path: str) -> str:
-        """Applies dataset mapping regex rules as defined in the configuration.
-        If no rules are defined, the path is returned as-is.
-        """
-        instance = cls.instance()
-        for pattern, repl in instance._dataset_mappers:
-            path = pattern.sub(repl, path)
-        return path

audiocraft/models/__init__.py DELETED Viewed

@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Models for EnCodec, AudioGen, MusicGen, as well as the generic LMModel.
-"""
-# flake8: noqa
-from . import builders, loaders
-from .encodec import (
-    CompressionModel, EncodecModel, DAC,
-    HFEncodecModel, HFEncodecCompressionModel)
-from .lm import LMModel
-from .lm_magnet import MagnetLMModel
-from .flow_matching import FlowMatchingModel
-from .encodec import CompressionModel, EncodecModel
-from .musicgen import MusicGen
-from .magnet import MAGNeT
-from .unet import DiffusionUnet

audiocraft/models/builders.py DELETED Viewed

@@ -1,351 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-All the functions to build the relevant models and modules
-from the Hydra config.
-"""
-import typing as tp
-import omegaconf
-import torch
-import audiocraft
-from .. import quantization as qt
-from ..modules.codebooks_patterns import (CoarseFirstPattern,
-                                          CodebooksPatternProvider,
-                                          DelayedPatternProvider,
-                                          MusicLMPattern,
-                                          ParallelPatternProvider,
-                                          UnrolledPatternProvider)
-from ..modules.conditioners import (BaseConditioner, ChromaStemConditioner,
-                                    CLAPEmbeddingConditioner,
-                                    ConditionFuser, JascoCondConst,
-                                    ConditioningProvider, LUTConditioner,
-                                    T5Conditioner, StyleConditioner)
-from ..modules.jasco_conditioners import (JascoConditioningProvider, ChordsEmbConditioner,
-                                          DrumsConditioner, MelodyConditioner)
-from ..modules.diffusion_schedule import MultiBandProcessor, SampleProcessor
-from ..utils.utils import dict_from_config
-from .encodec import (CompressionModel, EncodecModel,
-                      InterleaveStereoCompressionModel)
-from .lm import LMModel
-from .lm_magnet import MagnetLMModel
-from .flow_matching import FlowMatchingModel
-from .unet import DiffusionUnet
-def get_quantizer(
-    quantizer: str, cfg: omegaconf.DictConfig, dimension: int
-) -> qt.BaseQuantizer:
-    """Instantiate the quantizer named `quantizer` from its section of the config.
-    Args:
-        quantizer (str): Either "no_quant" or "rvq"; also the name of the config section read.
-        cfg (omegaconf.DictConfig): Config holding a section named after `quantizer`.
-        dimension (int): Passed as `dimension` to every quantizer except "no_quant".
-    """
-    quantizer_classes = {
-        "no_quant": qt.DummyQuantizer,
-        "rvq": qt.ResidualVectorQuantizer,
-    }
-    quantizer_cls = quantizer_classes[quantizer]
-    params = dict_from_config(getattr(cfg, quantizer))
-    if quantizer != "no_quant":
-        params["dimension"] = dimension
-    return quantizer_cls(**params)
-def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
-    """Instantiate the encoder and decoder of an Encodec autoencoder.
-    Args:
-        encoder_name (str): Name of the autoencoder, only "seanet" is supported.
-        cfg (omegaconf.DictConfig): Config holding a `seanet` section. Its `encoder` and
-            `decoder` entries override the shared parameters for the respective module.
-    Returns:
-        Tuple (encoder, decoder).
-    Raises:
-        KeyError: If `encoder_name` is not a supported autoencoder.
-    """
-    if encoder_name == "seanet":
-        kwargs = dict_from_config(getattr(cfg, "seanet"))
-        encoder_override_kwargs = kwargs.pop("encoder")
-        decoder_override_kwargs = kwargs.pop("decoder")
-        encoder_kwargs = {**kwargs, **encoder_override_kwargs}
-        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
-        encoder = audiocraft.modules.SEANetEncoder(**encoder_kwargs)
-        decoder = audiocraft.modules.SEANetDecoder(**decoder_kwargs)
-        return encoder, decoder
-    else:
-        # Report the value that was actually rejected, not the compression model name.
-        raise KeyError(f"Unexpected autoencoder {encoder_name}")
-def get_compression_model(cfg: omegaconf.DictConfig) -> CompressionModel:
-    """Build the compression model described by the config and move it to `cfg.device`.
-    Only `cfg.compression_model == "encodec"` is supported, any other value raises a KeyError.
-    """
-    if cfg.compression_model != "encodec":
-        raise KeyError(f"Unexpected compression model {cfg.compression_model}")
-    encodec_kwargs = dict_from_config(getattr(cfg, "encodec"))
-    autoencoder_name = encodec_kwargs.pop("autoencoder")
-    quantizer_name = encodec_kwargs.pop("quantizer")
-    encoder, decoder = get_encodec_autoencoder(autoencoder_name, cfg)
-    quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
-    frame_rate = encodec_kwargs["sample_rate"] // encoder.hop_length
-    renormalize = encodec_kwargs.pop("renormalize", False)
-    # deprecated params
-    encodec_kwargs.pop("renorm", None)
-    model = EncodecModel(
-        encoder,
-        decoder,
-        quantizer,
-        frame_rate=frame_rate,
-        renormalize=renormalize,
-        **encodec_kwargs,
-    )
-    return model.to(cfg.device)
-def get_jasco_model(cfg: omegaconf.DictConfig,
-                    compression_model: tp.Optional[CompressionModel] = None) -> FlowMatchingModel:
-    """Instantiate a flow matching model with a Jasco conditioning provider.
-    Args:
-        cfg (omegaconf.DictConfig): Model config.
-        compression_model (CompressionModel, optional): Compression model attached to the
-            `self_wav` conditioner. Must be provided when the drums conditioner is present.
-    Returns:
-        FlowMatchingModel: The model, moved to `cfg.device`.
-    """
-    lm_kwargs = dict_from_config(getattr(cfg, "transformer_lm"))
-    attribute_dropout = dict_from_config(getattr(cfg, "attribute_dropout"))
-    guidance_cfg = dict_from_config(getattr(cfg, "classifier_free_guidance"))
-    cfg_prob = guidance_cfg["training_dropout"]
-    cfg_coef = guidance_cfg["inference_coef"]
-    fuser = get_condition_fuser(cfg)
-    base_provider = get_conditioner_provider(lm_kwargs["dim"], cfg).to(cfg.device)
-    if JascoCondConst.DRM.value in base_provider.conditioners:  # use self_wav for drums
-        assert compression_model is not None
-        # use compression model for drums conditioning
-        drums_conditioner = base_provider.conditioners.self_wav
-        drums_conditioner.compression_model = compression_model
-        drums_conditioner.compression_model.requires_grad_(False)
-    # downcast to jasco conditioning provider
-    seq_len = cfg.compression_model_framerate * cfg.dataset.segment_duration
-    if JascoCondConst.CRD.value in cfg.conditioners:
-        chords_card = cfg.conditioners.chords.chords_emb.card
-    else:
-        chords_card = -1
-    condition_provider = JascoConditioningProvider(
-        device=base_provider.device,
-        conditioners=base_provider.conditioners,
-        chords_card=chords_card,
-        sequence_length=seq_len,
-    )
-    if len(fuser.fuse2cond["cross"]) > 0:  # enforce cross-att programmatically
-        lm_kwargs["cross_attention"] = True
-    # these two entries of the transformer_lm section are not passed to the flow matching model
-    lm_kwargs.pop("n_q", None)
-    lm_kwargs.pop("card", None)
-    model = FlowMatchingModel(
-        condition_provider=condition_provider,
-        fuser=fuser,
-        cfg_dropout=cfg_prob,
-        cfg_coef=cfg_coef,
-        attribute_dropout=attribute_dropout,
-        dtype=getattr(torch, cfg.dtype),
-        device=cfg.device,
-        **lm_kwargs,
-    )
-    return model.to(cfg.device)
-def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
-    """Build the transformer LM described by the config and move it to `cfg.device`.
-    `cfg.lm_model` must be "transformer_lm" (LMModel) or "transformer_lm_magnet" (MagnetLMModel),
-    any other value raises a KeyError.
-    """
-    if cfg.lm_model not in ["transformer_lm", "transformer_lm_magnet"]:
-        raise KeyError(f"Unexpected LM model {cfg.lm_model}")
-    lm_kwargs = dict_from_config(getattr(cfg, "transformer_lm"))
-    n_q = lm_kwargs["n_q"]
-    q_modeling = lm_kwargs.pop("q_modeling", None)
-    pattern_cfg = getattr(cfg, "codebooks_pattern")
-    attribute_dropout = dict_from_config(getattr(cfg, "attribute_dropout"))
-    guidance_cfg = dict_from_config(getattr(cfg, "classifier_free_guidance"))
-    cfg_prob = guidance_cfg["training_dropout"]
-    cfg_coef = guidance_cfg["inference_coef"]
-    fuser = get_condition_fuser(cfg)
-    condition_provider = get_conditioner_provider(lm_kwargs["dim"], cfg).to(cfg.device)
-    if len(fuser.fuse2cond["cross"]) > 0:  # enforce cross-att programmatically
-        lm_kwargs["cross_attention"] = True
-    if pattern_cfg.modeling is None:
-        # No codebook pattern configured: derive one from transformer_lm.q_modeling.
-        assert (
-            q_modeling is not None
-        ), "LM model should either have a codebook pattern defined or transformer_lm.q_modeling"
-        pattern_cfg = omegaconf.OmegaConf.create(
-            {"modeling": q_modeling, "delay": {"delays": list(range(n_q))}}
-        )
-    pattern_provider = get_codebooks_pattern_provider(n_q, pattern_cfg)
-    if cfg.lm_model == "transformer_lm_magnet":
-        lm_class = MagnetLMModel
-    else:
-        lm_class = LMModel
-    model = lm_class(
-        pattern_provider=pattern_provider,
-        condition_provider=condition_provider,
-        fuser=fuser,
-        cfg_dropout=cfg_prob,
-        cfg_coef=cfg_coef,
-        attribute_dropout=attribute_dropout,
-        dtype=getattr(torch, cfg.dtype),
-        device=cfg.device,
-        **lm_kwargs,
-    )
-    return model.to(cfg.device)
-def get_conditioner_provider(
-    output_dim: int, cfg: omegaconf.DictConfig
-) -> ConditioningProvider:
-    """Build the conditioning provider holding one conditioner per entry of `cfg.conditioners`.
-    Args:
-        output_dim (int): Output dimension given to the conditioners that accept one.
-        cfg (omegaconf.DictConfig): Config providing `device`, `dataset.segment_duration`
-            and the `conditioners` section.
-    Raises:
-        ValueError: If a conditioner declares an unrecognized model type.
-    """
-    device = cfg.device
-    duration = cfg.dataset.segment_duration
-    cfg = getattr(cfg, "conditioners")
-    dict_cfg = {} if cfg is None else dict_from_config(cfg)
-    condition_provider_args = dict_cfg.pop("args", {})
-    condition_provider_args.pop("merge_text_conditions_p", None)
-    condition_provider_args.pop("drop_desc_p", None)
-    # Model type -> (conditioner class, keyword arguments supplied on top of the model config).
-    with_dim_and_device = {"output_dim": output_dim, "device": device}
-    device_only = {"device": device}
-    factories: tp.Dict[str, tp.Tuple[tp.Any, tp.Dict[str, tp.Any]]] = {
-        "t5": (T5Conditioner, with_dim_and_device),
-        "lut": (LUTConditioner, {"output_dim": output_dim}),
-        "chroma_stem": (
-            ChromaStemConditioner,
-            {"output_dim": output_dim, "duration": duration, "device": device},
-        ),
-        "chords_emb": (ChordsEmbConditioner, device_only),
-        "drum_latents": (DrumsConditioner, device_only),
-        "melody": (MelodyConditioner, device_only),
-        "clap": (CLAPEmbeddingConditioner, with_dim_and_device),
-        "style": (StyleConditioner, with_dim_and_device),
-    }
-    conditioners: tp.Dict[str, BaseConditioner] = {}
-    for cond, cond_cfg in dict_cfg.items():
-        model_type = cond_cfg["model"]
-        model_args = cond_cfg[model_type]
-        if model_type not in factories:
-            raise ValueError(f"Unrecognized conditioning model: {model_type}")
-        conditioner_class, fixed_args = factories[model_type]
-        conditioners[str(cond)] = conditioner_class(**fixed_args, **model_args)
-    return ConditioningProvider(
-        conditioners, device=device, **condition_provider_args
-    )
-def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
-    """Build the condition fuser from the `fuser` section of the config.
-    Entries named after a fusing method form the `fuse2cond` mapping, every other entry
-    is passed to `ConditionFuser` as a keyword argument.
-    """
-    fuser_cfg = getattr(cfg, "fuser")
-    fuser_methods = ["sum", "cross", "prepend", "ignore", "input_interpolate"]
-    fuse2cond = {}
-    for method in fuser_methods:
-        if method in fuser_cfg:
-            fuse2cond[method] = fuser_cfg[method]
-    extra_kwargs = {}
-    for key, value in fuser_cfg.items():
-        if key not in fuser_methods:
-            extra_kwargs[key] = value
-    return ConditionFuser(fuse2cond=fuse2cond, **extra_kwargs)
-def get_codebooks_pattern_provider(
-    n_q: int, cfg: omegaconf.DictConfig
-) -> CodebooksPatternProvider:
-    """Build the codebooks pattern provider selected by `cfg.modeling`.
-    Args:
-        n_q (int): First positional argument of the pattern provider.
-        cfg (omegaconf.DictConfig): Pattern config. When it holds a section named after
-            `cfg.modeling`, that section provides the keyword arguments of the provider.
-    """
-    pattern_providers = {
-        "parallel": ParallelPatternProvider,
-        "delay": DelayedPatternProvider,
-        "unroll": UnrolledPatternProvider,
-        "coarse_first": CoarseFirstPattern,
-        "musiclm": MusicLMPattern,
-    }
-    name = cfg.modeling
-    if hasattr(cfg, name):
-        provider_kwargs = dict_from_config(cfg.get(name))
-    else:
-        provider_kwargs = {}
-    provider_cls = pattern_providers[name]
-    return provider_cls(n_q, **provider_kwargs)
-def get_debug_compression_model(device="cpu", sample_rate: int = 32000):
-    """Build a tiny compression model in eval mode, to be used for unit tests.
-    Args:
-        device: Device the model is moved to.
-        sample_rate (int): Either 16000 or 32000.
-    """
-    model_ratios = {
-        16000: [10, 8, 8],  # 25 Hz at 16kHz
-        32000: [10, 8, 16],  # 25 Hz at 32kHz
-    }
-    assert sample_rate in [
-        16000,
-        32000,
-    ], "unsupported sample rate for debug compression model"
-    ratios: tp.List[int] = model_ratios[sample_rate]
-    frame_rate = 25
-    seanet_kwargs: dict = dict(
-        n_filters=4,
-        n_residual_layers=1,
-        dimension=32,
-        ratios=ratios,
-    )
-    encoder = audiocraft.modules.SEANetEncoder(**seanet_kwargs)
-    decoder = audiocraft.modules.SEANetDecoder(**seanet_kwargs)
-    quantizer = qt.ResidualVectorQuantizer(dimension=32, bins=400, n_q=4)
-    warmup_input = torch.randn(8, 32, 128)
-    quantizer(warmup_input, 1)  # initialize kmeans etc.
-    model = EncodecModel(
-        encoder,
-        decoder,
-        quantizer,
-        frame_rate=frame_rate,
-        sample_rate=sample_rate,
-        channels=1,
-    )
-    return model.to(device).eval()
-def get_diffusion_model(cfg: omegaconf.DictConfig):
-    """Build a DiffusionUnet from `cfg.channels`, `cfg.schedule.num_steps` and `cfg.diffusion_unet`."""
-    # TODO Find a way to infer the channels from dset
-    unet_kwargs = cfg.diffusion_unet
-    return DiffusionUnet(
-        chin=cfg.channels, num_steps=cfg.schedule.num_steps, **unet_kwargs
-    )
-def get_processor(cfg, sample_rate: int = 24000):
-    """Return the sample processor selected by the config.
-    A plain `SampleProcessor` is returned unless `cfg.use` is set and `cfg.name` is
-    "multi_band_processor", in which case a `MultiBandProcessor` is built from the
-    remaining config entries.
-    """
-    processor = SampleProcessor()
-    if not cfg.use:
-        return processor
-    processor_kwargs = dict(cfg)
-    # "use" and "name" only drive the selection, they are not processor arguments.
-    del processor_kwargs["use"]
-    del processor_kwargs["name"]
-    if cfg.name == "multi_band_processor":
-        processor = MultiBandProcessor(sample_rate=sample_rate, **processor_kwargs)
-    return processor
-def get_debug_lm_model(device="cpu"):
-    """Build a tiny LM in eval mode, to be used for unit tests."""
-    dim = 16
-    pattern = DelayedPatternProvider(n_q=4)
-    description_conditioner = LUTConditioner(
-        n_bins=128, dim=dim, output_dim=dim, tokenizer="whitespace"
-    )
-    condition_provider = ConditioningProvider({"description": description_conditioner})
-    fuse2cond = {
-        "cross": ["description"],
-        "prepend": [],
-        "sum": [],
-        "input_interpolate": [],
-    }
-    fuser = ConditionFuser(fuse2cond)
-    lm_kwargs = dict(
-        n_q=4,
-        card=400,
-        dim=dim,
-        num_heads=4,
-        custom=True,
-        num_layers=2,
-        cross_attention=True,
-        causal=True,
-    )
-    lm = LMModel(pattern, condition_provider, fuser, **lm_kwargs)
-    return lm.to(device).eval()
-def get_wrapped_compression_model(
-    compression_model: CompressionModel, cfg: omegaconf.DictConfig
-) -> CompressionModel:
-    """Apply the optional wrapping and codebook settings of the config to a compression model.
-    When `cfg.interleave_stereo_codebooks.use` is set, the model is wrapped in an
-    `InterleaveStereoCompressionModel`. When `cfg.compression_model_n_q` is not None,
-    it becomes the number of codebooks of the returned model.
-    """
-    if hasattr(cfg, "interleave_stereo_codebooks") and cfg.interleave_stereo_codebooks.use:
-        wrapper_kwargs = dict_from_config(cfg.interleave_stereo_codebooks)
-        wrapper_kwargs.pop("use")
-        compression_model = InterleaveStereoCompressionModel(
-            compression_model, **wrapper_kwargs
-        )
-    if hasattr(cfg, "compression_model_n_q") and cfg.compression_model_n_q is not None:
-        compression_model.set_num_codebooks(cfg.compression_model_n_q)
-    return compression_model

audiocraft/models/encodec.py DELETED Viewed

@@ -1,506 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Compression models or wrapper around existing models.
-Also defines the main interface that a model must follow to be usable as an audio tokenizer.
-"""
-from abc import ABC, abstractmethod
-import logging
-import math
-from pathlib import Path
-import typing as tp
-from einops import rearrange
-import numpy as np
-import torch
-from torch import nn
-from transformers import EncodecModel as HFEncodecModel
-from .. import quantization as qt
-logger = logging.getLogger()
-class CompressionModel(ABC, nn.Module):
-    """Common interface of the compression models meant to be used as audio tokenizers
-    with a language model.
-    """
-    @abstractmethod
-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        ...
-    @abstractmethod
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """See `EncodecModel.encode`."""
-        ...
-    @abstractmethod
-    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        """See `EncodecModel.decode`."""
-        ...
-    @abstractmethod
-    def decode_latent(self, codes: torch.Tensor):
-        """Decode from the discrete codes to continuous latent space."""
-        ...
-    @property
-    @abstractmethod
-    def channels(self) -> int:
-        ...
-    @property
-    @abstractmethod
-    def frame_rate(self) -> float:
-        ...
-    @property
-    @abstractmethod
-    def sample_rate(self) -> int:
-        ...
-    @property
-    @abstractmethod
-    def cardinality(self) -> int:
-        ...
-    @property
-    @abstractmethod
-    def num_codebooks(self) -> int:
-        ...
-    @property
-    @abstractmethod
-    def total_codebooks(self) -> int:
-        ...
-    @abstractmethod
-    def set_num_codebooks(self, n: int):
-        """Set the active number of codebooks used by the quantizer."""
-        ...
-    @staticmethod
-    def get_pretrained(
-            name: str, device: tp.Union[torch.device, str] = 'cpu'
-            ) -> 'CompressionModel':
-        """Load a pretrained CompressionModel, move it to `device` and set it in eval mode.
-        Args:
-            name (Path or str): name of the pretrained model. See after.
-            device (torch.device or str): Device on which the model is loaded.
-        Pretrained models:
-            - dac_44khz (https://github.com/descriptinc/descript-audio-codec)
-            - dac_24khz (same)
-            - facebook/encodec_24khz (https://huggingface.co/facebook/encodec_24khz)
-            - facebook/encodec_32khz (https://huggingface.co/facebook/encodec_32khz)
-            - your own model on HuggingFace. Export instructions to come...
-        """
-        # Imported here rather than at module level.
-        from . import builders, loaders
-        model: CompressionModel
-        if name in ('dac_44khz', 'dac_24khz'):
-            # The DAC model type is the part of the name following "dac_".
-            dac_type = name.split('_')[1]
-            logger.info("Getting pretrained compression model from DAC %s", dac_type)
-            model = DAC(dac_type)
-        elif name == 'debug_compression_model':
-            logger.info("Getting pretrained compression model for debug")
-            model = builders.get_debug_compression_model()
-        elif Path(name).exists():
-            # We assume here if the paths exist that it is in fact an AC checkpoint
-            # that was exported using `audiocraft.utils.export` functions.
-            model = loaders.load_compression_model(name, device=device)
-        else:
-            logger.info("Getting pretrained compression model from HF %s", name)
-            wrapped = HFEncodecCompressionModel(HFEncodecModel.from_pretrained(name))
-            model = wrapped.to(device)
-        model = model.to(device)
-        return model.eval()
-class EncodecModel(CompressionModel):
-    """Encodec model operating on the raw waveform.
-    Args:
-        encoder (nn.Module): Encoder network.
-        decoder (nn.Module): Decoder network.
-        quantizer (qt.BaseQuantizer): Quantizer network.
-        frame_rate (int): Frame rate for the latent representation.
-        sample_rate (int): Audio sample rate.
-        channels (int): Number of audio channels.
-        causal (bool): Whether to use a causal version of the model.
-        renormalize (bool): Whether to renormalize the audio before running the model.
-    """
-    # we need assignment to override the property in the abstract class,
-    # I couldn't find a better way...
-    frame_rate: float = 0
-    sample_rate: int = 0
-    channels: int = 0
-    def __init__(self,
-                 encoder: nn.Module,
-                 decoder: nn.Module,
-                 quantizer: qt.BaseQuantizer,
-                 frame_rate: int,
-                 sample_rate: int,
-                 channels: int,
-                 causal: bool = False,
-                 renormalize: bool = False):
-        super().__init__()
-        self.encoder = encoder
-        self.decoder = decoder
-        self.quantizer = quantizer
-        self.frame_rate = frame_rate
-        self.sample_rate = sample_rate
-        self.channels = channels
-        self.renormalize = renormalize
-        self.causal = causal
-        # we force disabling here to avoid handling linear overlap of segments
-        # as supported in original EnCodec codebase.
-        assert not (self.causal and self.renormalize), 'Causal model does not support renormalize'
-    @property
-    def total_codebooks(self):
-        """Total number of quantizer codebooks available."""
-        return self.quantizer.total_codebooks
-    @property
-    def num_codebooks(self):
-        """Active number of codebooks used by the quantizer."""
-        return self.quantizer.num_codebooks
-    def set_num_codebooks(self, n: int):
-        """Set the active number of codebooks used by the quantizer."""
-        self.quantizer.set_num_codebooks(n)
-    @property
-    def cardinality(self):
-        """Cardinality of each codebook."""
-        return self.quantizer.bins
-    def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """Renormalize the input when `renormalize` is set.
-        Returns:
-            Tuple (x, scale). Without renormalization, `x` is returned unchanged and `scale` is None.
-        """
-        if not self.renormalize:
-            return x, None
-        mono = x.mean(dim=1, keepdim=True)
-        # root mean square of the mono signal over the last dimension
-        volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
-        scale = 1e-8 + volume
-        return x / scale, scale.view(-1, 1)
-    def postprocess(self,
-                    x: torch.Tensor,
-                    scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Multiply `x` by `scale` when a scale is given, otherwise return `x` unchanged."""
-        if scale is None:
-            return x
-        assert self.renormalize
-        return x * scale.view(-1, 1, 1)
-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        assert x.dim() == 3
-        input_length = x.shape[-1]
-        x, scale = self.preprocess(x)
-        latent = self.encoder(x)
-        q_res = self.quantizer(latent, self.frame_rate)
-        reconstructed = self.decoder(q_res.x)
-        # remove extra padding added by the encoder and decoder
-        assert reconstructed.shape[-1] >= input_length, (reconstructed.shape[-1], input_length)
-        reconstructed = reconstructed[..., :input_length]
-        q_res.x = self.postprocess(reconstructed, scale)
-        return q_res
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """Encode the given input tensor to quantized representation along with scale parameter.
-        Args:
-            x (torch.Tensor): Float tensor of shape [B, C, T]
-        Returns:
-            codes, scale (tuple of torch.Tensor, torch.Tensor): Tuple composed of:
-                codes a tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
-                scale a float tensor containing the scale for audio renormalization, or None
-                    when renormalization is disabled.
-        """
-        assert x.dim() == 3
-        x, scale = self.preprocess(x)
-        latent = self.encoder(x)
-        return self.quantizer.encode(latent), scale
-    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        """Decode the given codes to a reconstructed representation, using the scale to perform
-        audio denormalization if needed.
-        Args:
-            codes (torch.Tensor): Int tensor of shape [B, K, T]
-            scale (torch.Tensor, optional): Float tensor containing the scale value.
-        Returns:
-            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
-        """
-        latent = self.decode_latent(codes)
-        decoded = self.decoder(latent)
-        # the result contains extra padding added by the encoder and decoder
-        return self.postprocess(decoded, scale)
-    def decode_latent(self, codes: torch.Tensor):
-        """Decode from the discrete codes to continuous latent space."""
-        return self.quantizer.decode(codes)
-class DAC(CompressionModel):
-    """Compression model backed by a model loaded with `dac.utils.load_model`."""
-    def __init__(self, model_type: str = "44khz"):
-        super().__init__()
-        try:
-            import dac.utils
-        except ImportError:
-            raise RuntimeError(
-                "Could not import dac, make sure it is installed, "
-                "please run `pip install descript-audio-codec`"
-            )
-        self.model = dac.utils.load_model(model_type=model_type)
-        # All the codebooks are active until `set_num_codebooks` is called.
-        self.n_quantizers = self.total_codebooks
-        self.model.eval()
-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        # We don't support training with this.
-        raise NotImplementedError("Forward and training with DAC not supported.")
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        encoded = self.model.encode(x, self.n_quantizers)
-        codes = encoded[1]
-        # No scale is produced by this model.
-        return codes[:, :self.n_quantizers], None
-    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        assert scale is None
-        latent = self.decode_latent(codes)
-        return self.model.decode(latent)
-    def decode_latent(self, codes: torch.Tensor):
-        """Decode from the discrete codes to continuous latent space."""
-        decoded = self.model.quantizer.from_codes(codes)
-        return decoded[0]
-    @property
-    def channels(self) -> int:
-        return 1
-    @property
-    def frame_rate(self) -> float:
-        return self.model.sample_rate / self.model.hop_length
-    @property
-    def sample_rate(self) -> int:
-        return self.model.sample_rate
-    @property
-    def cardinality(self) -> int:
-        return self.model.codebook_size
-    @property
-    def num_codebooks(self) -> int:
-        return self.n_quantizers
-    @property
-    def total_codebooks(self) -> int:
-        return self.model.n_codebooks
-    def set_num_codebooks(self, n: int):
-        """Set the active number of codebooks used by the quantizer.
-        """
-        assert 1 <= n
-        assert n <= self.total_codebooks
-        self.n_quantizers = n
-class HFEncodecCompressionModel(CompressionModel):
-    """Wrapper around HuggingFace Encodec.
-    The selectable numbers of codebooks are derived from the target bandwidths listed in
-    the wrapped model's config; the largest one is active by default.
-    Args:
-        model (HFEncodecModel): The HuggingFace Encodec model to wrap.
-    """
-    def __init__(self, model: HFEncodecModel):
-        super().__init__()
-        self.model = model
-        bws = self.model.config.target_bandwidths
-        num_codebooks = [
-            bw * 1000 / (self.frame_rate * math.log2(self.cardinality))
-            for bw in bws
-        ]
-        # Distance of each value to its nearest integer. Measuring against `round` (rather than
-        # truncating) keeps a result such as 7.9999999 from being read as 7 with a delta of ~1.
-        deltas = [abs(nc - round(nc)) for nc in num_codebooks]
-        # Checking we didn't do some bad maths and we indeed have integers!
-        # Each delta is checked individually: `all(deltas) <= 1e-3` would compare a bool to a float.
-        assert all(delta <= 1e-3 for delta in deltas), deltas
-        self.possible_num_codebooks = [int(round(nc)) for nc in num_codebooks]
-        self.set_num_codebooks(max(self.possible_num_codebooks))
-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        """Not supported: this wrapper only exposes `encode` and `decode`."""
-        # We don't support training with this.
-        raise NotImplementedError("Forward and training with HF EncodecModel not supported.")
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """Encode `x` at the bandwidth matching the active number of codebooks.
-        Returns:
-            The first (and only) entry of the codes and of the scales returned by the wrapped model.
-        """
-        bandwidth_index = self.possible_num_codebooks.index(self.num_codebooks)
-        bandwidth = self.model.config.target_bandwidths[bandwidth_index]
-        res = self.model.encode(x, None, bandwidth)
-        # Exactly one entry is expected for both the codes and the scales.
-        assert len(res[0]) == 1
-        assert len(res[1]) == 1
-        return res[0][0], res[1][0]
-    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        """Decode `codes` (optionally with `scale`) through the wrapped model."""
-        if scale is None:
-            scales = [None]  # type: ignore
-        else:
-            # NOTE(review): `encode` returns the un-wrapped first scale (`res[1][0]`) while the codes
-            # are re-wrapped below with `codes[None]`; confirm whether this should be `[scale]`.
-            scales = scale  # type: ignore
-        res = self.model.decode(codes[None], scales)
-        return res[0]
-    def decode_latent(self, codes: torch.Tensor):
-        """Decode from the discrete codes to continuous latent space."""
-        return self.model.quantizer.decode(codes.transpose(0, 1))
-    @property
-    def channels(self) -> int:
-        """Number of audio channels from the wrapped model's config."""
-        return self.model.config.audio_channels
-    @property
-    def frame_rate(self) -> float:
-        """Sample rate divided by the product of the config's upsampling ratios."""
-        hop_length = int(np.prod(self.model.config.upsampling_ratios))
-        return self.sample_rate / hop_length
-    @property
-    def sample_rate(self) -> int:
-        """Sampling rate from the wrapped model's config."""
-        return self.model.config.sampling_rate
-    @property
-    def cardinality(self) -> int:
-        """Codebook size from the wrapped model's config."""
-        return self.model.config.codebook_size
-    @property
-    def num_codebooks(self) -> int:
-        """Active number of codebooks."""
-        return self._num_codebooks
-    @property
-    def total_codebooks(self) -> int:
-        """Largest selectable number of codebooks."""
-        return max(self.possible_num_codebooks)
-    def set_num_codebooks(self, n: int):
-        """Set the active number of codebooks used by the quantizer.
-        Raises:
-            ValueError: If `n` is not one of `possible_num_codebooks`.
-        """
-        if n not in self.possible_num_codebooks:
-            raise ValueError(f"Allowed values for num codebooks: {self.possible_num_codebooks}")
-        self._num_codebooks = n
-class InterleaveStereoCompressionModel(CompressionModel):
-    """Wraps a CompressionModel to support stereo inputs. The wrapped model
-    will be applied independently to the left and right channels, and both codebooks
-    will be interleaved. If the wrapped model returns a representation `[B, K ,T]` per
-    channel, then the output will be `[B, K * 2, T]`  or `[B, K, T * 2]` depending on
-    `per_timestep`.
-    Args:
-        model (CompressionModel): Compression model to wrap.
-        per_timestep (bool): Whether to interleave on the timestep dimension
-            or on the codebooks dimension.
-    """
-    def __init__(self, model: CompressionModel, per_timestep: bool = False):
-        super().__init__()
-        self.model = model
-        self.per_timestep = per_timestep
-        assert self.model.channels == 1, "Wrapped model is expected to be for monophonic audio"
-    @property
-    def total_codebooks(self):
-        """Total number of codebooks of the wrapped model."""
-        return self.model.total_codebooks
-    @property
-    def num_codebooks(self):
-        """Active number of codebooks used by the quantizer.
-        ..Warning:: this reports the number of codebooks after the interleaving
-        of the codebooks!
-        """
-        return self.model.num_codebooks if self.per_timestep else self.model.num_codebooks * 2
-    def set_num_codebooks(self, n: int):
-        """Set the active number of codebooks used by the quantizer.
-        ..Warning:: this sets the number of codebooks before the interleaving!
-        """
-        self.model.set_num_codebooks(n)
-    @property
-    def num_virtual_steps(self) -> float:
-        """Return the number of virtual steps, e.g. one real step
-        will be split into that many steps.
-        """
-        return 2 if self.per_timestep else 1
-    @property
-    def frame_rate(self) -> float:
-        """Frame rate of the wrapped model multiplied by the number of virtual steps."""
-        return self.model.frame_rate * self.num_virtual_steps
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of the wrapped model."""
-        return self.model.sample_rate
-    @property
-    def channels(self) -> int:
-        """Number of audio channels; always 2 for this wrapper."""
-        return 2
-    @property
-    def cardinality(self):
-        """Cardinality of each codebook.
-        """
-        return self.model.cardinality
-    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
-        """Not supported: use `encode` and `decode` instead."""
-        raise NotImplementedError("Not supported, use encode and decode.")
-    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """Encode each channel of `x` ([B, 2, T]) with the wrapped model and interleave the codes.
-        Returns:
-            A `(indices, scales)` tuple. `scales` stacks the two per-channel scales on dim 1,
-            or is None unless the wrapped model returned a scale for both channels.
-        """
-        B, C, T = x.shape
-        assert C == self.channels, f"Expecting stereo audio but audio num channels is {C}"
-        indices_c0, scales_c0 = self.model.encode(x[:, 0, ...].unsqueeze(1))
-        indices_c1, scales_c1 = self.model.encode(x[:, 1, ...].unsqueeze(1))
-        indices = torch.stack([indices_c0, indices_c1], dim=0)
-        scales: tp.Optional[torch.Tensor] = None
-        if scales_c0 is not None and scales_c1 is not None:
-            scales = torch.stack([scales_c0, scales_c1], dim=1)
-        if self.per_timestep:
-            indices = rearrange(indices, 'c b k t -> b k (t c)', c=2)
-        else:
-            indices = rearrange(indices, 'c b k t -> b (k c) t', c=2)
-        return (indices, scales)
-    def get_left_right_codes(self, codes: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        """Undo the interleaving done by `encode` and return the codes of each channel."""
-        if self.per_timestep:
-            codes = rearrange(codes, 'b k (t c) -> c b k t', c=2)
-        else:
-            codes = rearrange(codes, 'b (k c) t -> c b k t', c=2)
-        return codes[0], codes[1]
-    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        """Decode interleaved codes into audio, concatenating both channels on dim 1.
-        Args:
-            codes (torch.Tensor): Interleaved codes, as produced by `encode`.
-            scale (torch.Tensor, optional): Per-channel scales stacked on dim 1, as produced by `encode`.
-        """
-        B, K, T = codes.shape
-        assert T % self.num_virtual_steps == 0, "Provided codes' number of timesteps does not match"
-        assert K == self.num_codebooks, "Provided codes' number of codebooks does not match"
-        scale_c0, scale_c1 = None, None
-        if scale is not None:
-            assert scale.size(0) == B and scale.size(1) == 2, f"Scale has unexpected shape: {scale.shape}"
-            # `encode` stacks the channel scales on dim 1, so the channel has to be selected there;
-            # indexing dim 0 would pick batch items instead of channels.
-            scale_c0 = scale[:, 0, ...]
-            scale_c1 = scale[:, 1, ...]
-        codes_c0, codes_c1 = self.get_left_right_codes(codes)
-        audio_c0 = self.model.decode(codes_c0, scale_c0)
-        audio_c1 = self.model.decode(codes_c1, scale_c1)
-        return torch.cat([audio_c0, audio_c1], dim=1)
-    def decode_latent(self, codes: torch.Tensor):
-        """Decode from the discrete codes to continuous latent space."""
-        raise NotImplementedError("Not supported by interleaved stereo wrapped models.")

audiocraft/models/flow_matching.py DELETED Viewed

@@ -1,516 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from dataclasses import dataclass
-from functools import partial
-import logging
-import math
-import typing as tp
-import torch
-from torch import nn
-from torchdiffeq import odeint  # type: ignore
-from ..modules.streaming import StreamingModule
-from ..modules.transformer import create_norm_fn, StreamingTransformerLayer
-from ..modules.unet_transformer import UnetTransformer
-from ..modules.conditioners import (
-    ConditionFuser,
-    ClassifierFreeGuidanceDropout,
-    AttributeDropout,
-    ConditioningAttributes,
-    JascoCondConst
-)
-from ..modules.jasco_conditioners import JascoConditioningProvider
-from ..modules.activations import get_activation_fn
-from .lm import ConditionTensors, init_layer
-logger = logging.getLogger(__name__)
-# Plain data holder grouping a latents tensor with its mask tensor.
-@dataclass
-class FMOutput:
-    latents: torch.Tensor  # [B, T, D]
-    mask: torch.Tensor  # [B, T]
-class CFGTerm:
-    """
-    Base class for Multi Source Classifier-Free Guidance (CFG) terms. This class represents a term in the CFG process,
-    which is used to guide the generation process by adjusting the influence of different conditions.
-    Attributes:
-        conditions (dict): A dictionary of conditions that influence the generation process.
-        weight (float): The weight of the CFG term, determining its influence on the generation.
-    """
-    def __init__(self, conditions, weight):
-        self.conditions = conditions
-        self.weight = weight
-    def drop_irrelevant_conds(self, conditions=None):
-        """
-        Drops irrelevant conditions from the CFG term. This method should be implemented by subclasses.
-        Args:
-            conditions (optional): Unused by the base class. It has a default so that the base signature
-                accepts both the one-argument call and the argument-free call used by the subclasses,
-                which operate on `self.conditions`.
-        Raises:
-            NotImplementedError: If the method is not implemented in a subclass.
-        """
-        raise NotImplementedError("No base implementation for dropping irrelevant conditions.")
-class AllCFGTerm(CFGTerm):
-    """
-    Fully conditional CFG term: every condition is kept, nothing is dropped.
-    """
-    def __init__(self, conditions, weight):
-        super().__init__(conditions, weight)
-        # No-op for this term; invoked the same way the other terms invoke it.
-        self.drop_irrelevant_conds()
-    def drop_irrelevant_conds(self):
-        """Keep `self.conditions` untouched."""
-        return None
-class NullCFGTerm(CFGTerm):
-    """
-    Unconditional CFG term: all of its conditions are dropped so they have no influence.
-    """
-    def __init__(self, conditions, weight):
-        super().__init__(conditions, weight)
-        self.drop_irrelevant_conds()
-    def drop_irrelevant_conds(self):
-        """
-        Replaces `self.conditions` with the result of a classifier-free guidance dropout applied with
-        probability 1.0 to the "wav", "text" and "symbolic" condition types.
-        """
-        drop_everything = ClassifierFreeGuidanceDropout(p=1.0)
-        self.conditions = drop_everything(samples=self.conditions,
-                                          cond_types=["wav", "text", "symbolic"])
-class TextCFGTerm(CFGTerm):
-    """
-    A CFG term that drops the 'symbolic' and 'wav' attributes known to the model's attribute
-    dropout, each with probability 1.0, and leaves the other condition types as they are.
-    """
-    def __init__(self, conditions, weight, model_att_dropout):
-        """
-        Builds the term and immediately applies the dropout to its conditions.
-        Args:
-            conditions (dict): The conditions to be used in the CFG process.
-            weight (float): The weight of the CFG term.
-            model_att_dropout (object): The attribute dropouts used by the model; its `p` mapping
-                lists the attributes configured per condition type.
-        """
-        super().__init__(conditions, weight)
-        dropout_probs = model_att_dropout.p
-        # Every attribute listed for a temporal condition type gets a drop probability of 1.0;
-        # a condition type missing from the mapping yields an empty dict.
-        self.drop_symbolics = (
-            {name: 1.0 for name in dropout_probs['symbolic'].keys()} if 'symbolic' in dropout_probs else {}
-        )
-        self.drop_wav = (
-            {name: 1.0 for name in dropout_probs['wav'].keys()} if 'wav' in dropout_probs else {}
-        )
-        self.drop_irrelevant_conds()
-    def drop_irrelevant_conds(self):
-        """Applies the 'symbolic' and 'wav' attribute dropout to `self.conditions`."""
-        temporal_dropout = AttributeDropout({'symbolic': self.drop_symbolics,
-                                             'wav': self.drop_wav})
-        self.conditions = temporal_dropout(self.conditions)  # drop temporal conds
-class FlowMatchingModel(StreamingModule):
-    """
-    A flow matching model inherits from StreamingModule.
-    This model uses a transformer architecture to process and fuse conditions, applying learned embeddings and
-    transformations and predicts multi-source guided vector fields.
-    Attributes:
-        condition_provider (JascoConditioningProvider): Provider for conditioning attributes.
-        fuser (ConditionFuser): Fuser for combining multiple conditions.
-        dim (int): Dimensionality of the model's main features.
-        num_heads (int): Number of attention heads in the transformer.
-        flow_dim (int): Dimensionality of the flow features.
-        chords_dim (int): Dimensionality for chord embeddings, if used.
-        drums_dim (int): Dimensionality for drums embeddings, if used.
-        melody_dim (int): Dimensionality for melody embeddings, if used.
-        hidden_scale (int): Scaling factor for the dimensionality of the feedforward network in the transformer.
-        norm (str): Type of normalization to use ('layer_norm' or other supported types).
-        norm_first (bool): Whether to apply normalization before other operations in the transformer layers.
-        bias_proj (bool): Whether to include bias in the projection layers.
-        weight_init (Optional[str]): Method for initializing weights.
-        depthwise_init (Optional[str]): Method for initializing depthwise convolutional layers.
-        zero_bias_init (bool): Whether to initialize biases to zero.
-        cfg_dropout (float): Probability passed to the classifier-free guidance dropout.
-        cfg_coef (float): Classifier-free guidance coefficient stored on the model.
-        attribute_dropout (Dict[str, Dict[str, float]], optional): Dropout rates for specific attributes.
-            None is treated as an empty mapping.
-        time_embedding_dim (int): Dimensionality of time embeddings.
-        **kwargs: Additional keyword arguments for the transformer.
-    Methods:
-        __init__: Initializes the model with the specified attributes and configuration.
-    """
-    def __init__(self, condition_provider: JascoConditioningProvider,
-                 fuser: ConditionFuser,
-                 dim: int = 128,
-                 num_heads: int = 8,
-                 flow_dim: int = 128,
-                 chords_dim: int = 0,
-                 drums_dim: int = 0,
-                 melody_dim: int = 0,
-                 hidden_scale: int = 4,
-                 norm: str = 'layer_norm',
-                 norm_first: bool = False,
-                 bias_proj: bool = True,
-                 weight_init: tp.Optional[str] = None,
-                 depthwise_init: tp.Optional[str] = None,
-                 zero_bias_init: bool = False,
-                 cfg_dropout: float = 0,
-                 cfg_coef: float = 1.0,
-                 attribute_dropout: tp.Optional[tp.Dict[str, tp.Dict[str, float]]] = None,
-                 time_embedding_dim: int = 128,
-                 **kwargs):
-        super().__init__()
-        # A fresh dict per instance instead of a default shared between all calls.
-        if attribute_dropout is None:
-            attribute_dropout = {}
-        self.cfg_coef = cfg_coef
-        self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
-        self.att_dropout = AttributeDropout(p=attribute_dropout)
-        self.condition_provider = condition_provider
-        self.fuser = fuser
-        self.dim = dim  # transformer dim
-        self.flow_dim = flow_dim
-        self.chords_dim = chords_dim
-        # Input projection: latents concatenated with the temporal conditions -> transformer dim.
-        self.emb = nn.Linear(flow_dim + chords_dim + drums_dim + melody_dim, dim, bias=False)
-        if 'activation' in kwargs:
-            kwargs['activation'] = get_activation_fn(kwargs['activation'])
-        self.transformer = UnetTransformer(
-            d_model=dim, num_heads=num_heads, dim_feedforward=int(hidden_scale * dim),
-            norm=norm, norm_first=norm_first,
-            layer_class=StreamingTransformerLayer,
-            **kwargs)
-        self.out_norm: tp.Optional[nn.Module] = None
-        if norm_first:
-            self.out_norm = create_norm_fn(norm, dim)
-        self.linear = nn.Linear(dim, flow_dim, bias=bias_proj)
-        self._init_weights(weight_init, depthwise_init, zero_bias_init)
-        self._fsdp: tp.Optional[nn.Module]
-        # Written through __dict__ so the attribute is set without going through nn.Module.__setattr__.
-        self.__dict__['_fsdp'] = None
-        # init time parameter embedding
-        self.d_temb1 = time_embedding_dim
-        self.d_temb2 = 4 * time_embedding_dim
-        self.temb = nn.Module()
-        self.temb.dense = nn.ModuleList([
-            torch.nn.Linear(self.d_temb1,
-                            self.d_temb2),
-            torch.nn.Linear(self.d_temb2,
-                            self.d_temb2),
-        ])
-        self.temb_proj = nn.Linear(self.d_temb2, dim)
-    def _get_timestep_embedding(self, timesteps, embedding_dim):
-        """
-        #######################################################################################################
-        TAKEN FROM: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/model.py
-        #######################################################################################################
-        This matches the implementation in Denoising Diffusion Probabilistic Models:
-        From Fairseq.
-        Build sinusoidal embeddings.
-        This matches the implementation in tensor2tensor, but differs slightly
-        from the description in Section 3.5 of "Attention Is All You Need".
-        Args:
-            timesteps (torch.Tensor): 1-D tensor of time values.
-            embedding_dim (int): Size of the returned embedding. Values 2 and 3 give `half_dim == 1`
-                and therefore a division by zero below.
-        Returns:
-            torch.Tensor: Embedding of shape [len(timesteps), embedding_dim].
-        """
-        assert len(timesteps.shape) == 1
-        half_dim = embedding_dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
-        emb = emb.to(device=timesteps.device)
-        emb = timesteps.float()[:, None] * emb[None, :]
-        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
-        if embedding_dim % 2 == 1:  # zero pad
-            emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
-        return emb
-    def _embed_time_parameter(self, t: torch.Tensor):
-        """
-        #######################################################################################################
-        TAKEN FROM: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/model.py
-        #######################################################################################################
-        Embeds the flattened time parameter: sinusoidal embedding followed by two dense layers
-        with a swish activation in between.
-        """
-        temb = self._get_timestep_embedding(t.flatten(), self.d_temb1)
-        temb = self.temb.dense[0](temb)
-        temb = temb * torch.sigmoid(temb)  # swish activation
-        temb = self.temb.dense[1](temb)
-        return temb
-    def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
-        """Initialization of the transformer module weights.
-        Args:
-            weight_init (str, optional): Weight initialization strategy. See ``get_init_fn`` for valid options.
-            depthwise_init (str, optional): Depthwise initialization strategy. The following options are valid:
-                'current' where the depth corresponds to the current layer index or 'global' where the total number
-                of layer is used as depth. If not set, no depthwise initialization strategy is used.
-            zero_bias_init (bool): Whether to initialize bias to zero or not.
-        """
-        assert depthwise_init is None or depthwise_init in ['current', 'global']
-        assert depthwise_init is None or weight_init is not None, \
-            "If 'depthwise_init' is defined, a 'weight_init' method should be provided."
-        assert not zero_bias_init or weight_init is not None, \
-            "If 'zero_bias_init', a 'weight_init' method should be provided"
-        if weight_init is None:
-            return
-        init_layer(self.emb, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
-        for layer_idx, tr_layer in enumerate(self.transformer.layers):
-            depth = None
-            if depthwise_init == 'current':
-                depth = layer_idx + 1
-            elif depthwise_init == 'global':
-                depth = len(self.transformer.layers)
-            init_fn = partial(init_layer, method=weight_init, init_depth=depth, zero_bias_init=zero_bias_init)
-            tr_layer.apply(init_fn)
-        init_layer(self.linear, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
-    def _align_seq_length(self,
-                          cond: torch.Tensor,
-                          seq_len: int = 500):
-        """Trim or zero-pad `cond` ([B, T, C]) along dim 1 so that it has exactly `seq_len` steps."""
-        # trim if needed
-        cond = cond[:, :seq_len, :]
-        # pad if needed
-        B, T, C = cond.shape
-        if T < seq_len:
-            cond = torch.cat((cond, torch.zeros((B, seq_len - T, C), dtype=cond.dtype, device=cond.device)), dim=1)
-        return cond
-    def forward(self,
-                latents: torch.Tensor,
-                t: torch.Tensor,
-                conditions: tp.List[ConditioningAttributes],
-                condition_tensors: tp.Optional[ConditionTensors] = None) -> torch.Tensor:
-        """Apply flow matching forward pass on latents and conditions.
-        Given a tensor of noisy latents of shape [B, T, D] with D the flow dim and T the sequence steps,
-        and a time parameter tensor t, return the vector field with shape [B, T, D].
-        Args:
-            latents (torch.Tensor): noisy latents.
-            t (torch.Tensor): time parameter; it is flattened before being embedded.
-            conditions (list of ConditioningAttributes): Must be empty; this model only accepts
-                pre-computed conditions through `condition_tensors`.
-            condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
-                tensors. Required despite the default value.
-        Returns:
-            torch.Tensor: estimated vector field v_theta.
-        """
-        assert condition_tensors is not None, "FlowMatchingModel require pre-calculation of condition tensors"
-        assert not conditions, "Shouldn't pass unprocessed conditions to FlowMatchingModel."
-        B, T, D = latents.shape
-        x = latents
-        # concat temporal conditions on the feature dimension
-        temporal_conds = JascoCondConst.ALL.value
-        for cond in temporal_conds:
-            if cond not in condition_tensors:
-                continue
-            c = self._align_seq_length(condition_tensors[cond][0], seq_len=T)
-            x = torch.concat((x, c), dim=-1)
-        # project to transformer dimension
-        input_ = self.emb(x)
-        input_, cross_attention_input = self.fuser(input_, condition_tensors)
-        # embed time parameter
-        t_embs = self._embed_time_parameter(t)
-        # add it to cross_attention_input
-        cross_attention_input = cross_attention_input + self.temb_proj(t_embs[:, None, :])
-        out = self.transformer(input_, cross_attention_src=cross_attention_input)
-        if self.out_norm:
-            out = self.out_norm(out)
-        v_theta = self.linear(out)  # [B, T, D]
-        # remove the prefix from the model outputs
-        if len(self.fuser.fuse2cond['prepend']) > 0:
-            # The output is [B, T', D]: keep the last T steps of the time axis (dim 1).
-            # Slicing dim 2 would truncate the feature dimension instead.
-            v_theta = v_theta[:, -T:]
-        return v_theta  # [B, T, D]
-    def _multi_source_cfg_preprocess(self,
-                                     conditions: tp.List[ConditioningAttributes],
-                                     cfg_coef_all: float,
-                                     cfg_coef_txt: float,
-                                     min_weight: float = 1e-6):
-        """
-        Preprocesses the CFG terms for multi-source conditional generation.
-        Args:
-            conditions (list): A list of conditions to be applied.
-            cfg_coef_all (float): The coefficient for all conditions.
-            cfg_coef_txt (float): The coefficient for text conditions.
-            min_weight (float): The minimal absolute weight for calculating a CFG term.
-        Returns:
-            tuple: A tuple containing condition_tensors and cfg_terms.
-                condition_tensors is a dictionary or ConditionTensors object with tokenized conditions.
-                cfg_terms is a list of CFGTerm objects with weights adjusted based on the coefficients.
-        """
-        condition_tensors: tp.Optional[ConditionTensors]
-        cfg_terms: tp.List[CFGTerm] = []
-        if conditions:
-            # conditional terms
-            cfg_terms = [AllCFGTerm(conditions=conditions, weight=cfg_coef_all),
-                         TextCFGTerm(conditions=conditions, weight=cfg_coef_txt,
-                                     model_att_dropout=self.att_dropout)]
-            # add null term; its weight makes all the weights sum to 1
-            cfg_terms.append(NullCFGTerm(conditions=conditions, weight=1 - sum([ct.weight for ct in cfg_terms])))
-            # remove terms with negligible weight; a new list is built because removing items
-            # from a list while iterating over it skips the element following each removal
-            cfg_terms = [ct for ct in cfg_terms if abs(ct.weight) >= min_weight]
-            conds: tp.List[ConditioningAttributes] = sum([ct.conditions for ct in cfg_terms], [])
-            tokenized = self.condition_provider.tokenize(conds)
-            condition_tensors = self.condition_provider(tokenized)
-        else:
-            condition_tensors = {}
-        return condition_tensors, cfg_terms
-    def estimated_vector_field(self, z, t, condition_tensors=None, cfg_terms=None):
-        """
-        Estimates the vector field for the given latent variables and time parameter,
-        conditioned on the provided conditions.
-        Args:
-            z (Tensor): The latent variables.
-            t (float): The time variable.
-            condition_tensors (ConditionTensors, optional): The condition tensors. Defaults to None.
-            cfg_terms (list, optional): The list of CFG terms. None is treated as an empty list.
-        Returns:
-            Tensor: The estimated vector field.
-        """
-        if cfg_terms is None:
-            cfg_terms = []
-        if len(cfg_terms) > 1:
-            z = z.repeat(len(cfg_terms), 1, 1)  # duplicate noisy latents for multi-source CFG
-        v_thetas = self(latents=z, t=t, conditions=[], condition_tensors=condition_tensors)
-        return self._multi_source_cfg_postprocess(v_thetas, cfg_terms)
-    def _multi_source_cfg_postprocess(self, v_thetas, cfg_terms):
-        """
-        Postprocesses the vector fields generated for each CFG term to combine them into a single vector field.
-        Multi source guidance occurs here.
-        Args:
-            v_thetas (Tensor): The vector fields for each CFG term.
-            cfg_terms (list): The CFG terms used.
-        Returns:
-            Tensor: The combined vector field.
-        """
-        if len(cfg_terms) <= 1:
-            return v_thetas
-        # One chunk of the batch per CFG term, combined as a weighted sum.
-        v_theta_per_term = v_thetas.chunk(len(cfg_terms))
-        return sum([ct.weight * term_vf for ct, term_vf in zip(cfg_terms, v_theta_per_term)])
-    @torch.no_grad()
-    def generate(self,
-                 prompt: tp.Optional[torch.Tensor] = None,
-                 conditions: tp.Optional[tp.List[ConditioningAttributes]] = None,
-                 num_samples: tp.Optional[int] = None,
-                 max_gen_len: int = 256,
-                 callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                 cfg_coef_all: float = 3.0,
-                 cfg_coef_txt: float = 1.0,
-                 euler: bool = False,
-                 euler_steps: int = 100,
-                 ode_rtol: float = 1e-5,
-                 ode_atol: float = 1e-5,
-                 ) -> torch.Tensor:
-        """
-        Generate audio latents given a prompt or unconditionally. This method supports both Euler integration
-        and adaptive ODE solving to generate sequences based on the specified conditions and CFG coefficients.
-        Args:
-            prompt (torch.Tensor, optional): Only its batch size is read, to infer the number of samples
-                when `num_samples` is None. Defaults to None.
-            conditions (List[ConditioningAttributes], optional): List of conditioning attributes - text,
-                symbolic or audio. None is treated as an empty list.
-            num_samples (int, optional): Number of samples to generate.
-                                         If None, it is inferred from the prompt, then from the number
-                                         of conditions, and finally defaults to 1.
-            max_gen_len (int): Maximum length of the generated sequence.
-            callback (Callable[[int, int], None], optional): Callback function to monitor the generation process.
-                It is only invoked by the adaptive ODE solver path, not by the Euler path.
-            cfg_coef_all (float): Coefficient for the fully conditional CFG term.
-            cfg_coef_txt (float): Coefficient for text CFG term.
-            euler (bool): If True, use Euler integration, otherwise use adaptive ODE solver.
-            euler_steps (int): Number of Euler steps to perform if Euler integration is used.
-            ode_rtol (float): ODE solver rtol threshold.
-            ode_atol (float): ODE solver atol threshold.
-        Returns:
-            torch.Tensor: Generated latents, shaped as (num_samples, max_gen_len, feature_dim).
-        """
-        assert not self.training, "generation shouldn't be used in training mode."
-        if conditions is None:
-            conditions = []
-        first_param = next(iter(self.parameters()))
-        device = first_param.device
-        # Checking all input shapes are consistent.
-        possible_num_samples = []
-        if num_samples is not None:
-            possible_num_samples.append(num_samples)
-        elif prompt is not None:
-            possible_num_samples.append(prompt.shape[0])
-        elif conditions:
-            possible_num_samples.append(len(conditions))
-        else:
-            possible_num_samples.append(1)
-        # `all(...)` is required here: asserting on the list itself is always true once it is non-empty.
-        assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent inputs shapes"
-        num_samples = possible_num_samples[0]
-        condition_tensors, cfg_terms = self._multi_source_cfg_preprocess(conditions, cfg_coef_all, cfg_coef_txt)
-        # flow matching inference
-        B, T, D = num_samples, max_gen_len, self.flow_dim
-        z_0 = torch.randn((B, T, D), device=device)
-        if euler:
-            # vanilla Euler integration
-            dt = (1 / euler_steps)
-            z = z_0
-            t = torch.zeros((1, ), device=device)
-            for _ in range(euler_steps):
-                v_theta = self.estimated_vector_field(z, t,
-                                                      condition_tensors=condition_tensors,
-                                                      cfg_terms=cfg_terms)
-                z = z + dt * v_theta
-                t = t + dt
-            z_1 = z
-        else:
-            # solve with dynamic ode integrator (dopri5)
-            t = torch.tensor([0, 1.0 - 1e-5], device=device)
-            num_evals = 0
-            # define ode vector field function
-            def inner_ode_func(t, z):
-                nonlocal num_evals
-                num_evals += 1
-                if callback is not None:
-                    # The adaptive solver has no fixed step count; this total is only an estimate.
-                    ESTIMATED_ODE_SOLVER_STEPS = 300
-                    callback(num_evals, ESTIMATED_ODE_SOLVER_STEPS)
-                return self.estimated_vector_field(z, t,
-                                                   condition_tensors=condition_tensors,
-                                                   cfg_terms=cfg_terms)
-            ode_opts: dict = {"options": {}}
-            z = odeint(
-                inner_ode_func,
-                z_0,
-                t,
-                **{"atol": ode_atol, "rtol": ode_rtol, **ode_opts},
-            )
-            logger.info("Generated in %d steps", num_evals)
-            # odeint returns the solution at every requested time; keep the last one
-            z_1 = z[-1]
-        return z_1

audiocraft/models/genmodel.py DELETED Viewed

@@ -1,273 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Base implementation for audio generative models. This base implementation
-combines all the required components to run inference with pretrained audio
-generative models. It can be easily inherited by downstream model classes to
-provide easy access to the generation API.
-"""
-from abc import ABC, abstractmethod
-import typing as tp
-import omegaconf
-import torch
-import gradio as gr
-from .encodec import CompressionModel
-from .lm import LMModel
-from .builders import get_wrapped_compression_model
-from ..data.audio_utils import convert_audio
-from ..modules.conditioners import ConditioningAttributes
-from ..utils.autocast import TorchAutocast
-class BaseGenModel(ABC):
-    """Base generative model with convenient generation API.
-    Concrete models implement ``set_generation_params`` and ``get_pretrained``.
-    This base class prepares the conditioning attributes, encodes optional audio
-    prompts, samples tokens from the language model (window by window when
-    ``duration`` exceeds ``max_duration``) and decodes them with the compression model.
-    Args:
-        name (str): name of the model.
-        compression_model (CompressionModel): Compression model
-            used to map audio to invertible discrete representations.
-        lm (LMModel): Language model over discrete representations.
-        max_duration (float, optional): maximum duration the model can produce,
-            otherwise, inferred from the training params.
-    Raises:
-        ValueError: If `max_duration` is omitted and `lm` has no `cfg` to infer it from.
-    """
-    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
-                 max_duration: tp.Optional[float] = None):
-        self.name = name
-        self.compression_model = compression_model
-        self.lm = lm
-        self.cfg: tp.Optional[omegaconf.DictConfig] = None
-        # Just to be safe, let's put everything in eval mode.
-        self.compression_model.eval()
-        self.lm.eval()
-        if hasattr(lm, 'cfg'):
-            cfg = lm.cfg
-            assert isinstance(cfg, omegaconf.DictConfig)
-            self.cfg = cfg
-        if self.cfg is not None:
-            # With a config available, the compression model is replaced by its wrapped variant.
-            self.compression_model = get_wrapped_compression_model(self.compression_model, self.cfg)
-        if max_duration is None:
-            if self.cfg is not None:
-                max_duration = lm.cfg.dataset.segment_duration  # type: ignore
-            else:
-                raise ValueError("You must provide max_duration when building directly your GenModel")
-        assert max_duration is not None
-        self.max_duration: float = max_duration
-        self.duration = self.max_duration
-        # self.extend_stride is the length of audio extension when generating samples longer
-        # than self.max_duration. NOTE: the derived class must set self.extend_stride to a
-        # positive float value when generating with self.duration > self.max_duration.
-        self.extend_stride: tp.Optional[float] = None
-        self.device = next(iter(lm.parameters())).device
-        self.generation_params: dict = {}
-        # Invoked as callback(fraction, message) while tokens are being generated.
-        self._progress_callback: tp.Optional[tp.Callable[[float, str], tp.Any]] = None
-        # Mixed precision is only enabled when the LM does not live on the CPU.
-        if self.device.type == 'cpu':
-            self.autocast = TorchAutocast(enabled=False)
-        else:
-            self.autocast = TorchAutocast(
-                enabled=True, device_type=self.device.type, dtype=torch.float16)
-    @property
-    def frame_rate(self) -> float:
-        """Roughly the number of AR steps per seconds."""
-        return self.compression_model.frame_rate
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of the generated audio."""
-        return self.compression_model.sample_rate
-    @property
-    def audio_channels(self) -> int:
-        """Audio channels of the generated audio."""
-        return self.compression_model.channels
-    def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[float, str], tp.Any]] = None):
-        """Override the default progress callback.
-        Args:
-            progress_callback (callable, optional): Called as ``progress_callback(fraction, message)``
-                during generation, with ``fraction`` in [0, 1] and ``message`` a human readable
-                status string. Pass None to remove a previously set callback.
-        """
-        self._progress_callback = progress_callback
-    @abstractmethod
-    def set_generation_params(self, *args, **kwargs):
-        """Set the generation parameters."""
-        raise NotImplementedError("No base implementation for setting generation params.")
-    @staticmethod
-    @abstractmethod
-    def get_pretrained(name: str, device=None):
-        """Build a pretrained model from its name; must be provided by subclasses."""
-        raise NotImplementedError("No base implementation for getting pretrained model")
-    @torch.no_grad()
-    def _prepare_tokens_and_attributes(
-            self,
-            descriptions: tp.Sequence[tp.Optional[str]],
-            prompt: tp.Optional[torch.Tensor],
-    ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
-        """Prepare model inputs.
-        Args:
-            descriptions (list of str): A list of strings used as text conditioning.
-            prompt (torch.Tensor): A batch of waveforms used for continuation.
-        Returns:
-            tuple: One ConditioningAttributes per description, and the encoded prompt
-                tokens (None when no prompt is given).
-        """
-        attributes = [
-            ConditioningAttributes(text={'description': description})
-            for description in descriptions]
-        if prompt is not None:
-            if descriptions is not None:
-                assert len(descriptions) == len(prompt), "Prompt and nb. descriptions doesn't match"
-            prompt = prompt.to(self.device)
-            prompt_tokens, scale = self.compression_model.encode(prompt)
-            assert scale is None
-        else:
-            prompt_tokens = None
-        return attributes, prompt_tokens
-    def generate_unconditional(self, num_samples: int, progress: bool = False,
-                               return_tokens: bool = False) -> tp.Union[torch.Tensor,
-                                                                        tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples in an unconditional manner.
-        Args:
-            num_samples (int): Number of samples to be generated.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-        Returns:
-            The decoded audio, or the tuple (audio, tokens) when `return_tokens` is True.
-        """
-        descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress)
-        if return_tokens:
-            return self.generate_audio(tokens), tokens
-        return self.generate_audio(tokens)
-    def generate(self, descriptions: tp.List[str], progress: bool = False, return_tokens: bool = False) \
-            -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples conditioned on text.
-        Args:
-            descriptions (list of str): A list of strings used as text conditioning.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-        Returns:
-            The decoded audio, or the tuple (audio, tokens) when `return_tokens` is True.
-        """
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
-        assert prompt_tokens is None
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress)
-        if return_tokens:
-            return self.generate_audio(tokens), tokens
-        return self.generate_audio(tokens)
-    def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
-                              descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
-                              progress: bool = False, return_tokens: bool = False) \
-            -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples conditioned on audio prompts and an optional text description.
-        Args:
-            prompt (torch.Tensor): A batch of waveforms used for continuation.
-                Prompt should be [B, C, T], or [C, T] if only one sample is generated.
-            prompt_sample_rate (int): Sampling rate of the given audio waveforms.
-            descriptions (list of str, optional): A list of strings used as text conditioning. Defaults to None.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-        Returns:
-            The decoded audio, or the tuple (audio, tokens) when `return_tokens` is True.
-        Raises:
-            ValueError: If `prompt` is neither 2- nor 3-dimensional.
-        """
-        if prompt.dim() == 2:
-            prompt = prompt[None]
-        if prompt.dim() != 3:
-            raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
-        # Bring the prompt to the sample rate and channel count of the compression model.
-        prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
-        if descriptions is None:
-            descriptions = [None] * len(prompt)
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
-        assert prompt_tokens is not None
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress)
-        if return_tokens:
-            return self.generate_audio(tokens), tokens
-        return self.generate_audio(tokens)
-    def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
-                         prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False,
-                         progress_callback: tp.Optional[gr.Progress] = None) -> torch.Tensor:
-        """Generate discrete audio tokens given audio prompt and/or conditions.
-        Args:
-            attributes (list of ConditioningAttributes): Conditions used for generation (text/melody).
-            prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            progress_callback (gr.Progress, optional): Gradio progress tracker, called as
-                ``progress_callback(fraction, message)`` on every reported step.
-        Returns:
-            torch.Tensor: Generated tokens, a 3-dimensional tensor with time as the last axis.
-        Raises:
-            ValueError: If windowed generation is required and `extend_stride` is shorter than one frame.
-        """
-        total_gen_len = int(self.duration * self.frame_rate)
-        max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
-        current_gen_offset: int = 0
-        def _progress_callback(generated_tokens: int, tokens_to_generate: int):
-            # `tokens_to_generate` only describes the window currently being sampled, so
-            # progress is measured against the length of the whole request instead.
-            # Note that total_gen_len might be quite wrong depending on the
-            # codebook pattern used, but with delay it is almost accurate.
-            generated_tokens += current_gen_offset
-            if total_gen_len > 0:
-                fraction = min(generated_tokens / total_gen_len, 1.0)
-            else:
-                # Nothing to generate: report completion instead of dividing by zero.
-                fraction = 1.0
-            generated_seconds = fraction * self.duration
-            message = f"Generated {generated_seconds: 6.2f}/{self.duration: 6.2f} seconds"
-            if self._progress_callback is not None:
-                self._progress_callback(fraction, message)
-            if progress_callback is not None:
-                # Update Gradio progress bar
-                progress_callback(fraction, message)
-            if progress:
-                print(f'{generated_seconds: 6.2f} / {self.duration: 6.2f}', end='\r')
-        if prompt_tokens is not None:
-            if prompt_tokens.shape[-1] > max_prompt_len:
-                prompt_tokens = prompt_tokens[..., :max_prompt_len]
-        # The callback is always installed; it decides by itself what to report.
-        callback = _progress_callback
-        if self.duration <= self.max_duration:
-            # generate by sampling from LM, simple case.
-            with self.autocast:
-                gen_tokens = self.lm.generate(
-                    prompt_tokens, attributes,
-                    callback=callback, max_gen_len=total_gen_len, **self.generation_params)
-        else:
-            assert self.extend_stride is not None, "Stride should be defined to generate beyond max_duration"
-            assert self.extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
-            stride_tokens = int(self.frame_rate * self.extend_stride)
-            if stride_tokens <= 0:
-                # A stride below one frame would never move the generation window forward.
-                raise ValueError("extend_stride is too small to advance generation by at least one frame.")
-            all_tokens = []
-            if prompt_tokens is None:
-                prompt_length = 0
-            else:
-                all_tokens.append(prompt_tokens)
-                prompt_length = prompt_tokens.shape[-1]
-            while current_gen_offset + prompt_length < total_gen_len:
-                time_offset = current_gen_offset / self.frame_rate
-                chunk_duration = min(self.duration - time_offset, self.max_duration)
-                max_gen_len = int(chunk_duration * self.frame_rate)
-                with self.autocast:
-                    gen_tokens = self.lm.generate(
-                        prompt_tokens, attributes,
-                        callback=callback, max_gen_len=max_gen_len, **self.generation_params)
-                # Only keep what is new in this window; the prompt part was stored already.
-                if prompt_tokens is None:
-                    all_tokens.append(gen_tokens)
-                else:
-                    all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
-                # Everything after the stride becomes the prompt of the next window.
-                prompt_tokens = gen_tokens[:, :, stride_tokens:]
-                prompt_length = prompt_tokens.shape[-1]
-                current_gen_offset += stride_tokens
-            gen_tokens = torch.cat(all_tokens, dim=-1)
-        return gen_tokens
-    def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
-        """Generate Audio from tokens.
-        Args:
-            gen_tokens (torch.Tensor): 3-dimensional tensor of generated tokens.
-        Returns:
-            torch.Tensor: Output of the compression model decoder for these tokens.
-        """
-        assert gen_tokens.dim() == 3
-        with torch.no_grad():
-            gen_audio = self.compression_model.decode(gen_tokens, None)
-        return gen_audio

audiocraft/models/lm.py DELETED Viewed

@@ -1,588 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from dataclasses import dataclass
-from functools import partial
-import logging
-import math
-import typing as tp
-import torch
-from torch import nn
-from ..utils import utils
-from ..modules.streaming import StreamingModule, State
-from ..modules.transformer import StreamingTransformer, create_norm_fn
-from ..modules.conditioners import (
-    ConditionFuser,
-    ClassifierFreeGuidanceDropout,
-    AttributeDropout,
-    ConditioningProvider,
-    ConditioningAttributes,
-    ConditionType,
-    _drop_description_condition
-)
-from ..modules.codebooks_patterns import CodebooksPatternProvider
-from ..modules.activations import get_activation_fn
-# Module-level logger, named after this module.
-logger = logging.getLogger(__name__)
-# Conditioning values keyed by a string (presumably the condition/attribute name - TODO confirm).
-ConditionTensors = tp.Dict[str, ConditionType]
-# Either a single set of condition tensors, or a pair of them; `_sample_next_token`
-# unpacks the pair form as (condition_tensors, null_condition_tensors) for two-step CFG.
-CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
-def get_init_fn(method: str, input_dim: int, init_depth: tp.Optional[int] = None):
-    """Build the in-place initializer used for LM layers.
-    The target standard deviation is ``1 / sqrt(input_dim)``, further divided by
-    ``sqrt(2 * init_depth)`` when a depth is given.
-    Inspired from xlformers: https://github.com/fairinternal/xlformers
-    Args:
-        method (str): Method name for init function. Valid options are:
-            'gaussian', 'uniform'.
-        input_dim (int): Input dimension of the initialized module.
-        init_depth (int, optional): Optional init depth value used to rescale
-            the standard deviation if defined.
-    Returns:
-        A ``functools.partial`` around the matching ``torch.nn.init`` function.
-    Raises:
-        ValueError: If `method` is neither 'gaussian' nor 'uniform'.
-    """
-    std = 1 / math.sqrt(input_dim)
-    if init_depth is not None:
-        # Deeper layers get a proportionally smaller spread.
-        std /= math.sqrt(2 * init_depth)
-    if method == 'uniform':
-        # A uniform law on [-limit, limit] has standard deviation limit / sqrt(3).
-        limit = math.sqrt(3) * std
-        return partial(torch.nn.init.uniform_, a=-limit, b=limit)
-    if method == 'gaussian':
-        # Normal law truncated at three standard deviations on each side.
-        return partial(
-            torch.nn.init.trunc_normal_, mean=0.0, std=std, a=-3 * std, b=3 * std
-        )
-    raise ValueError("Unsupported layer initialization method")
-def init_layer(m: nn.Module,
-               method: str,
-               init_depth: tp.Optional[int] = None,
-               zero_bias_init: bool = False):
-    """Initialize the weights of a Linear or Embedding module in place.
-    Modules of any other type are left untouched.
-    Args:
-        m (nn.Module): Module to initialize.
-        method (str): Method name for the init function, see ``get_init_fn``.
-        init_depth (int, optional): Optional init depth value used to rescale
-            the standard deviation if defined. Only applied to Linear modules.
-        zero_bias_init (bool): Whether to set the bias of Linear modules to 0.
-    """
-    if isinstance(m, nn.Linear):
-        fill = get_init_fn(method, m.in_features, init_depth=init_depth)
-    elif isinstance(m, nn.Embedding):
-        # Embeddings never use the depth rescaling.
-        fill = get_init_fn(method, m.embedding_dim, init_depth=None)
-    else:
-        return
-    param = m.weight
-    if param.device.type == 'cpu' and param.dtype == torch.float16:
-        # Half precision weights on CPU are initialized through a float32 copy
-        # that is then written back as float16.
-        upcast = param.float()
-        fill(upcast)
-        param.data[:] = upcast.half()
-    else:
-        fill(param)
-    if isinstance(m, nn.Linear) and zero_bias_init and m.bias is not None:
-        nn.init.constant_(m.bias, 0)
-class ScaledEmbedding(nn.Embedding):
-    """Embedding that can carry its own learning rate for the optimizer.
-    Args:
-        lr (float, optional): Learning rate to expose for this embedding.
-            All other arguments are forwarded to ``nn.Embedding``.
-    """
-    def __init__(self, *args, lr=None, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.lr = lr
-    def make_optim_group(self):
-        """Return an optimizer parameter group; "lr" is only present when it was set."""
-        params = list(self.parameters())
-        if self.lr is None:
-            return {"params": params}
-        return {"params": params, "lr": self.lr}
-@dataclass
-class LMOutput:
-    # Result container built by `LMModel.compute_predictions`.
-    # The logits are already re-aligned with the input codes
-    # hence no extra shift is required, e.g. when computing CE
-    logits: torch.Tensor  # [B, K, T, card]
-    mask: torch.Tensor  # [B, K, T], mask over valid and invalid positions
-class LMModel(StreamingModule):
-    """Transformer-based language model on multiple streams of codes.
-    Args:
-        pattern_provider (CodebooksPatternProvider): Pattern provider for codebook interleaving.
-        condition_provider (MusicConditioningProvider): Conditioning provider from metadata.
-        fuser (ConditionFuser): Fuser handling the fusing of conditions with language model input.
-        n_q (int): Number of parallel streams to model.
-        card (int): Cardinality, vocabulary size.
-        dim (int): Dimension of the transformer encoder.
-        num_heads (int): Number of heads for the transformer encoder.
-        hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
-        norm (str): Normalization method.
-        norm_first (bool): Use pre-norm instead of post-norm.
-        emb_lr (float, optional): Embedding-specific learning rate.
-        bias_proj (bool): Use bias for output projections.
-        weight_init (str, optional): Method for weight initialization.
-        depthwise_init (str, optional): Method for depthwise weight initialization.
-        zero_bias_init (bool): If true and bias in Linears, initialize bias to zeros.
-        cfg_dropout (float): Classifier-free guidance dropout.
-        cfg_coef (float): Classifier-free guidance coefficient.
-        attribute_dropout (dict): Attribute dropout probabilities.
-        two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
-        **kwargs: Additional parameters for the transformer encoder.
-    """
-    # Builds the embeddings, the transformer and the output heads, then initializes the weights.
-    # NOTE(review): `attribute_dropout` defaults to a mutable `{}` that is shared between calls;
-    # this is only safe as long as nothing mutates it - confirm in AttributeDropout.
-    def __init__(self, pattern_provider: CodebooksPatternProvider, condition_provider: ConditioningProvider,
-                 fuser: ConditionFuser, n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8,
-                 hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False,
-                 emb_lr: tp.Optional[float] = None, bias_proj: bool = True,
-                 weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
-                 zero_bias_init: bool = False, cfg_dropout: float = 0, cfg_coef: float = 1.0,
-                 attribute_dropout: tp.Dict[str, tp.Dict[str, float]] = {}, two_step_cfg: bool = False,
-                 **kwargs):
-        super().__init__()
-        self.cfg_coef = cfg_coef
-        self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
-        self.att_dropout = AttributeDropout(p=attribute_dropout)
-        self.condition_provider = condition_provider
-        self.fuser = fuser
-        self.card = card
-        # One extra embedding row: `special_token_id` is equal to `card`.
-        embed_dim = self.card + 1
-        self.n_q = n_q
-        self.dim = dim
-        self.pattern_provider = pattern_provider
-        self.two_step_cfg = two_step_cfg
-        # One embedding table per code stream, each optionally carrying its own learning rate.
-        self.emb = nn.ModuleList([ScaledEmbedding(embed_dim, dim, lr=emb_lr) for _ in range(n_q)])
-        # The activation given in kwargs is passed through `get_activation_fn` before
-        # being forwarded to the transformer.
-        if 'activation' in kwargs:
-            kwargs['activation'] = get_activation_fn(kwargs['activation'])
-        self.transformer = StreamingTransformer(
-            d_model=dim, num_heads=num_heads, dim_feedforward=int(hidden_scale * dim),
-            norm=norm, norm_first=norm_first, **kwargs)
-        # An output normalization is only created when `norm_first` is set.
-        self.out_norm: tp.Optional[nn.Module] = None
-        if norm_first:
-            self.out_norm = create_norm_fn(norm, dim)
-        # One linear head per code stream, projecting `dim` features to `card` logits.
-        self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
-        self._init_weights(weight_init, depthwise_init, zero_bias_init)
-        # `_fsdp` is written straight into `__dict__`, bypassing `nn.Module.__setattr__`
-        # (presumably so that a wrapper stored here is not registered as a submodule - TODO confirm).
-        self._fsdp: tp.Optional[nn.Module]
-        self.__dict__['_fsdp'] = None
-    def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
-        """Initialize embeddings, transformer layers and output heads, in that order.
-        Nothing is done when `weight_init` is None.
-        Args:
-            weight_init (str, optional): Weight initialization strategy. See ``get_init_fn`` for valid options.
-            depthwise_init (str, optional): Depthwise initialization strategy. The following options are valid:
-                'current' where the depth corresponds to the current layer index or 'global' where the total number
-                of layer is used as depth. If not set, no depthwise initialization strategy is used.
-            zero_bias_init (bool): Whether to initialize bias to zero or not.
-        """
-        assert depthwise_init in (None, 'current', 'global')
-        assert depthwise_init is None or weight_init is not None, \
-            "If 'depthwise_init' is defined, a 'weight_init' method should be provided."
-        assert not zero_bias_init or weight_init is not None, \
-            "If 'zero_bias_init', a 'weight_init' method should be provided"
-        if weight_init is None:
-            return
-        # Embeddings and output heads never use a depth rescaling.
-        init_flat = partial(init_layer, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
-        for table in self.emb:
-            init_flat(table)
-        blocks = self.transformer.layers
-        for position, block in enumerate(blocks, start=1):
-            if depthwise_init == 'current':
-                depth = position
-            elif depthwise_init == 'global':
-                depth = len(blocks)
-            else:
-                depth = None
-            # `apply` visits every submodule of the transformer layer.
-            block.apply(partial(init_layer, method=weight_init, init_depth=depth, zero_bias_init=zero_bias_init))
-        for head in self.linears:
-            init_flat(head)
-    @property
-    def special_token_id(self) -> int:
-        """Id of the special token: equal to `card`; the embedding tables are built
-        with `card + 1` rows, which leaves room for it."""
-        return self.card
-    @property
-    def num_codebooks(self) -> int:
-        """Number of parallel code streams modeled, i.e. `n_q`."""
-        return self.n_q
-    def forward(self, sequence: torch.Tensor,
-                conditions: tp.List[ConditioningAttributes],
-                condition_tensors: tp.Optional[ConditionTensors] = None,
-                stage: int = -1) -> torch.Tensor:
-        """Apply language model on sequence and conditions.
-        The per-codebook embeddings of `sequence` are summed, fused with the
-        conditioning, passed through the transformer and projected by one
-        linear head per codebook.
-        Args:
-            sequence (torch.Tensor): Codes to model, of shape [B, K, S] with K the
-                number of codebooks and S the number of sequence steps.
-            conditions (list of ConditioningAttributes): Conditions to use when modeling
-                the given codes. Must be empty when `condition_tensors` is given; when
-                evaluating multiple times with the same conditioning, pre-compute the
-                tensors and pass them as `condition_tensors`.
-            condition_tensors (dict[str, ConditionType], optional): Pre-computed conditioning
-                tensors, see `conditions`.
-            stage (int): The codebook level that is being predicted. Relevant for MAGNeT
-                in which prediction is done in a codebook-by-codebook manner.
-                Takes values in range(n_q), and ignored by default.
-        Returns:
-            torch.Tensor: Logits of shape [B, K, S, card].
-        """
-        _, num_streams, num_steps = sequence.shape
-        assert num_streams == self.num_codebooks, "Sequence shape must match the specified number of codebooks"
-        # One embedding table per codebook; their outputs are added together.
-        embedded = sum(self.emb[idx](sequence[:, idx]) for idx in range(num_streams))
-        if condition_tensors is not None:
-            assert not conditions, "Shouldn't pass both conditions and condition_tensors."
-        else:
-            assert not self._is_streaming, "Conditions tensors should be precomputed when streaming."
-            # Dropout modules come first, then the surviving conditions are tokenized and encoded.
-            dropped = self.att_dropout(self.cfg_dropout(conditions))
-            condition_tensors = self.condition_provider(self.condition_provider.tokenize(dropped))
-        fused, cross_src = self.fuser(embedded, condition_tensors)
-        # A per-stage attention mask is only looked up when a stage is explicitly requested.
-        if stage >= 0:
-            stage_mask = self.attn_mask_per_stage[stage]  # type: ignore
-        else:
-            stage_mask = None
-        hidden = self.transformer(fused, cross_attention_src=cross_src, src_mask=stage_mask)
-        if self.out_norm:
-            hidden = self.out_norm(hidden)
-        heads = [self.linears[idx](hidden) for idx in range(num_streams)]
-        logits = torch.stack(heads, dim=1)  # [B, K, S, card]
-        # When some conditions are prepended to the input, only the last S steps are kept.
-        if len(self.fuser.fuse2cond['prepend']) > 0:
-            logits = logits[:, :, -num_steps:]
-        return logits  # [B, K, S, card]
-    def compute_predictions(
-            self, codes: torch.Tensor,
-            conditions: tp.List[ConditioningAttributes],
-            condition_tensors: tp.Optional[ConditionTensors] = None,
-            stage: int = -1,
-            keep_only_valid_steps: bool = True) -> LMOutput:
-        """Run the model on codes [B, K, T] through the codebook interleaving pattern.
-        The codes are laid out as a pattern sequence, fed to the model, and the
-        resulting logits are mapped back onto the layout of the original codes.
-        Args:
-            codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
-                K the number of codebooks and T the number of timesteps.
-            conditions (list of ConditioningAttributes): conditionings to use when modeling
-                the given codes. Note that when evaluating multiple time with the same conditioning
-                you should pre-compute those and pass them as `condition_tensors`.
-            condition_tensors (dict[str, ConditionType], optional): pre-computed conditioning
-                tensors, see `conditions`.
-            stage (int): The codebook level that is being predicted. Relevant for MAGNeT
-                in which prediction is done in a codebook-by-codebook manner.
-                Takes values in range(n_q), and ignored by default.
-            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
-                Steps that are beyond valid steps will be replaced by the special_token in that case.
-        Returns:
-            LMOutput: Language model outputs
-                logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
-                    i.e. the first item corresponds to logits to predict the first code, meaning that
-                    no additional shifting of codes and logits is required.
-                mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
-                    Given the specified interleaving strategies, parts of the logits and codes should
-                    not be considered as valid predictions because of invalid context.
-        """
-        batch_size, _, num_timesteps = codes.shape
-        codes = codes.contiguous()
-        pattern = self.pattern_provider.get_pattern(num_timesteps)
-        # [B, K, T] -> [B, K, S]; masked positions hold special_token_id.
-        sequence_codes, _, _ = pattern.build_pattern_sequence(
-            codes, self.special_token_id, keep_only_valid_steps=keep_only_valid_steps,
-        )
-        # Go through the FSDP wrapper when one has been set.
-        runner = self._fsdp if self._fsdp is not None else self
-        seq_logits = runner(sequence_codes, conditions, condition_tensors, stage=stage)  # [B, K, S, card]
-        seq_logits = seq_logits.permute(0, 3, 1, 2)  # [B, card, K, S]
-        # NaN is the filler value, which makes it obvious if unexpected logits are consumed.
-        code_logits, _, valid = pattern.revert_pattern_logits(
-            seq_logits, float('nan'), keep_only_valid_steps=keep_only_valid_steps
-        )
-        code_logits = code_logits.permute(0, 2, 3, 1)  # [B, K, T, card]
-        valid = valid[None, :, :].expand(batch_size, -1, -1)  # [K, T] -> [B, K, T]
-        return LMOutput(code_logits, valid)
-    def _sample_next_token(self,
-                           sequence: torch.Tensor,
-                           cfg_conditions: CFGConditions,
-                           unconditional_state: State,
-                           use_sampling: bool = False,
-                           temp: float = 1.0,
-                           top_k: int = 0,
-                           top_p: float = 0.0,
-                           cfg_coef: tp.Optional[float] = None,
-                           cfg_coef_beta: tp.Optional[float] = None,
-                           two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
-        """Sample next token from the model given a sequence and a set of conditions. The model supports
-        multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
-        Args:
-            sequence (torch.Tensor): Current sequence of shape [B, K, S]
-                with K corresponding to the number of codebooks and S the number of sequence steps.
-                S = 1 in streaming mode, except for the first step that contains a bigger prompt.
-            condition_tensors (dict[str, ConditionType): Set of conditions. If CFG is used,
-                should be twice the batch size, being the concatenation of the conditions + null conditions.
-            use_sampling (bool): Whether to use a sampling strategy or not.
-            temp (float): Sampling temperature.
-            top_k (int): K for "top-k" sampling.
-            top_p (float): P for "top-p" sampling.
-            cfg_coef (float, optional): classifier free guidance coefficient
-            cfg_coef_beta (float, optional): If None, simple classifier free guidance is used with cfg_coef.
-                If not None, we apply double classifier free guidance as introduced in MusicGen-Style
-                in paragraph 4.3 (https://arxiv.org/pdf/2407.12563). This beta coefficient is meant to
-                push the text condition more than the style condition in the case where both text and style
-                conditions are being used.
-            two_step_cfg (bool): Whether to run classifier free-guidance with 2 distinct steps.
-        Returns:
-            next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
-        """
-        B = sequence.shape[0]
-        cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
-        model = self if self._fsdp is None else self._fsdp
-        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
-        if cfg_coef_beta is not None:
-            assert isinstance(cfg_conditions, dict)
-            condition_tensors = cfg_conditions
-            if condition_tensors:
-                # Preparing for CFG, predicting conditional text and style, conditional style
-                # and unconditional
-                sequence = torch.cat([sequence, sequence, sequence], dim=0)
-            all_logits = model(
-                sequence,
-                conditions=[], condition_tensors=condition_tensors)
-            if condition_tensors:
-                cond_logits, wav_logits, uncond_logits = all_logits.split(B, dim=0)  # [B, K, T, card]
-                logits = uncond_logits + cfg_coef * (
-                    wav_logits + cfg_coef_beta * (cond_logits - wav_logits) - uncond_logits
-                    )
-        elif two_step_cfg and cfg_conditions != {}:
-            assert isinstance(cfg_conditions, tuple), type(cfg_conditions)
-            condition_tensors, null_condition_tensors = cfg_conditions
-            cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
-            state = self.get_streaming_state()
-            self.set_streaming_state(unconditional_state)
-            uncond_logits = model(sequence, conditions=[], condition_tensors=null_condition_tensors)
-            unconditional_state.update(self.get_streaming_state())
-            self.set_streaming_state(state)
-            logits = uncond_logits + (cond_logits - uncond_logits) * self.cfg_coef
-        else:
-            assert isinstance(cfg_conditions, dict)
-            condition_tensors = cfg_conditions
-            if condition_tensors:
-                # Preparing for CFG, predicting both conditional and unconditional logits.
-                sequence = torch.cat([sequence, sequence], dim=0)
-            all_logits = model(
-                sequence,
-                conditions=[], condition_tensors=condition_tensors)
-            if condition_tensors:
-                cond_logits, uncond_logits = all_logits.split(B, dim=0)  # [B, K, T, card]
-                logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
-            else:
-                logits = all_logits
-        logits = logits.permute(0, 1, 3, 2)  # [B, K, card, T]
-        logits = logits[..., -1]  # [B x K x card]
-        # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
-        if use_sampling and temp > 0.0:
-            probs = torch.softmax(logits / temp, dim=-1)
-            if top_p > 0.0:
-                next_token = utils.sample_top_p(probs, p=top_p)
-            elif top_k > 0:
-                next_token = utils.sample_top_k(probs, k=top_k)
-            else:
-                next_token = utils.multinomial(probs, num_samples=1)
-        else:
-            next_token = torch.argmax(logits, dim=-1, keepdim=True)
-        return next_token
-    @torch.no_grad()
-    def generate(self,
-                 prompt: tp.Optional[torch.Tensor] = None,
-                 conditions: tp.List[ConditioningAttributes] = [],
-                 num_samples: tp.Optional[int] = None,
-                 max_gen_len: int = 256,
-                 use_sampling: bool = True,
-                 temp: float = 1.0,
-                 top_k: int = 250,
-                 top_p: float = 0.0,
-                 cfg_coef: tp.Optional[float] = None,
-                 cfg_coef_beta: tp.Optional[float] = None,
-                 two_step_cfg: tp.Optional[bool] = None,
-                 remove_prompts: bool = False,
-                 check: bool = False,
-                 callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                 ) -> torch.Tensor:
-        """Generate tokens sampling from the model given a prompt or unconditionally. Generation can
-        be performed in a greedy fashion or using sampling with top K and top P strategies.
-        Args:
-            prompt (torch.Tensor, optional): Prompt tokens of shape [B, K, T].
-            conditions (list of ConditioningAttributes, optional): List of conditions.
-            num_samples (int, optional): Number of samples to generate. When None, it is taken from the
-                prompt batch dimension, else from the number of conditions, else defaults to 1.
-            max_gen_len (int): Maximum generation length.
-            use_sampling (bool): Whether to use a sampling strategy or not.
-            temp (float): Sampling temperature.
-            top_k (int): K for "top-k" sampling.
-            top_p (float): P for "top-p" sampling.
-            cfg_coef (float, optional): Classifier-free guidance coefficient.
-            cfg_coef_beta (float, optional): If None, simple classifier free guidance is used with cfg_coef.
-                If not None, we apply double classifier free guidance as introduced in MusicGen-Style
-                in paragraph 4.3 (https://arxiv.org/pdf/2407.12563). This beta coefficient is meant to
-                push the text condition more than the style condition in the case where both text and style
-                conditions are being used.
-            two_step_cfg (bool, optional): Whether to perform classifier-free guidance with two steps generation.
-            remove_prompts (bool): Whether to remove prompts from generation or not.
-            check (bool): Whether to apply further checks on generated sequence.
-            callback (Callback, optional): Callback function to report generation progress. It is called
-                after every generated step with (steps done, total steps).
-        Returns:
-            torch.Tensor: Generated tokens.
-        Raises:
-            AssertionError: If the model is in training mode, if the prompt is longer than max_gen_len,
-                or if one of the sanity checks on the generated sequence fails.
-        """
-        assert not self.training, "generation shouldn't be used in training mode."
-        first_param = next(iter(self.parameters()))
-        device = first_param.device
-        # Resolve the number of samples from the first available source, in priority order:
-        # explicit `num_samples`, then the prompt batch dimension, then the number of conditions.
-        if num_samples is None:
-            if prompt is not None:
-                num_samples = prompt.shape[0]
-            elif conditions:
-                num_samples = len(conditions)
-            else:
-                num_samples = 1
-        # below we create set of conditions: one conditional and one unconditional
-        # to do that we merge the regular condition together with the null condition
-        # we then do 1 forward pass instead of 2.
-        # the reason for that is two-fold:
-        # 1. it is about x2 faster than doing 2 forward passes
-        # 2. avoid the streaming API treating the 2 passes as part of different time steps
-        # We also support doing two different passes, in particular to ensure that
-        # the padding structure is exactly the same between train and test.
-        # With a batch size of 1, this can be slower though.
-        cfg_conditions: CFGConditions = {}
-        if conditions:
-            if cfg_coef_beta is not None:
-                # Double CFG: stack [text + style, style only, unconditional] along the batch axis.
-                wav_conditions = _drop_description_condition(conditions)
-                null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
-                conditions = conditions + wav_conditions + null_conditions
-                tokenized = self.condition_provider.tokenize(conditions)
-                cfg_conditions = self.condition_provider(tokenized)
-            else:
-                two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
-                null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
-                if two_step_cfg:
-                    # Two distinct forward passes: keep conditional and null conditions separate.
-                    cfg_conditions = (
-                        self.condition_provider(self.condition_provider.tokenize(conditions)),
-                        self.condition_provider(self.condition_provider.tokenize(null_conditions)),
-                    )
-                else:
-                    # Single forward pass: stack [conditional, unconditional] along the batch axis.
-                    conditions = conditions + null_conditions
-                    tokenized = self.condition_provider.tokenize(conditions)
-                    cfg_conditions = self.condition_provider(tokenized)
-        if prompt is None:
-            assert num_samples > 0
-            prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
-        B, K, T = prompt.shape
-        start_offset = T
-        print(f"start_offset: {start_offset} | max_gen_len: {max_gen_len}")
-        assert start_offset <= max_gen_len
-        pattern = self.pattern_provider.get_pattern(max_gen_len)
-        # this token is used as default value for codes that are not generated yet
-        unknown_token = -1
-        # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
-        gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
-        # filling the gen_codes with the prompt if needed
-        gen_codes[..., :start_offset] = prompt
-        # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
-        gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
-        # retrieve the start_offset in the sequence:
-        # it is the first sequence step that contains the `start_offset` timestep
-        start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
-        assert start_offset_sequence is not None
-        with self.streaming():
-            unconditional_state = self.get_streaming_state()
-            prev_offset = 0
-            gen_sequence_len = gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
-            for offset in range(start_offset_sequence, gen_sequence_len):
-                # get current sequence (note that the streaming API is providing the caching over previous offsets)
-                curr_sequence = gen_sequence[..., prev_offset:offset]
-                curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
-                if check:
-                    # check coherence between mask and sequence
-                    assert (curr_sequence == torch.where(curr_mask, curr_sequence, self.special_token_id)).all()
-                    # should never happen as gen_sequence is filled progressively
-                    assert not (curr_sequence == unknown_token).any()
-                # sample next token from the model, next token shape is [B, K, 1]
-                next_token = self._sample_next_token(
-                    curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
-                    cfg_coef=cfg_coef, cfg_coef_beta=cfg_coef_beta, two_step_cfg=two_step_cfg)
-                # ensure the tokens that should be masked are properly set to special_token_id
-                # as the model never output special_token_id
-                valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
-                next_token[~valid_mask] = self.special_token_id
-                # ensure we don't overwrite prompt tokens, we only write over unknown tokens
-                # (then mask tokens should be left as is as well, which is correct)
-                gen_sequence[..., offset:offset+1] = torch.where(
-                    gen_sequence[..., offset:offset+1] == unknown_token,
-                    next_token, gen_sequence[..., offset:offset+1]
-                )
-                prev_offset = offset
-                if callback is not None:
-                    callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
-        unconditional_state.clear()
-        # ensure sequence has been entirely filled
-        assert not (gen_sequence == unknown_token).any()
-        # ensure gen_sequence pattern and mask are matching
-        # which means the gen_sequence is valid according to the pattern
-        assert (
-            gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence, self.special_token_id)
-        ).all()
-        # get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
-        out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
-        # sanity checks over the returned codes and corresponding masks
-        assert (out_codes[..., :max_gen_len] != unknown_token).all()
-        assert (out_mask[..., :max_gen_len] == 1).all()
-        out_start_offset = start_offset if remove_prompts else 0
-        out_codes = out_codes[..., out_start_offset:max_gen_len]
-        # ensure the returned codes are all valid
-        assert (out_codes >= 0).all() and (out_codes <= self.card).all()
-        return out_codes

audiocraft/models/lm_magnet.py DELETED Viewed

@@ -1,500 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import logging
-import math
-import typing as tp
-import torch
-import numpy as np
-from ..utils import utils
-from ..modules.conditioners import (
-    ClassifierFreeGuidanceDropout,
-    ConditioningAttributes,
-    ConditionType,
-)
-from .lm import LMModel
-logger = logging.getLogger(__name__)
-ConditionTensors = tp.Dict[str, ConditionType]
-CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
-class MagnetLMModel(LMModel):
-    """Transformer-based, non-autoregressive model, operates on multiple streams of audio tokens (MAGNeT).
-    Args:
-        subcodes_context (int): The number of timesteps attended in the self-attention blocks of codebooks > 0.
-                                When set to -1, attention is unrestricted and all timesteps are attended. Defaults to 5.
-        compression_model_framerate (int): frame rate of the audio tokenizer.
-        segment_duration (int): Sample length in seconds.
-        span_len (int): Determines the length of masking spans. This is the minimal length of consecutive masked tokens,
-                        for both training and inference. Defaults to 3.
-        **kwargs: Additional parameters for the LMModel.
-    """
-    def __init__(self, subcodes_context: int = 5, compression_model_framerate: int = 50,
-                 segment_duration: int = 10, span_len: int = 3, **kwargs):
-        """Initialize the underlying LMModel, then precompute the per-stage attention masks.
-        `kwargs` is forwarded untouched to LMModel and must contain the keys
-        'causal', 'num_heads', 'device' and 'dtype', which are also read here.
-        """
-        super().__init__(**kwargs)
-        self.causal = kwargs['causal']
-        self.subcodes_context, self.span_len = subcodes_context, span_len
-        # The masks are built for compression_model_framerate * segment_duration tokens.
-        mask_options = dict(num_heads=kwargs['num_heads'], device=kwargs['device'], dtype=kwargs['dtype'])
-        self._build_attn_masks(compression_model_framerate=compression_model_framerate,
-                               segment_duration=segment_duration,
-                               **mask_options)
-    def restricted_context_attn_mask(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
-        """Build a non-causal local attention mask of shape [seq_len, seq_len].
-        An entry is 0 when the query and key positions are at most self.subcodes_context
-        steps apart, and -inf otherwise.
-        Args:
-            seq_len (int): token sequence length.
-            device (torch.device): device of the output tensor.
-            dtype (torch.dtype): data type of the output tensor.
-        Returns:
-            torch.Tensor: The restricted attention mask.
-        """
-        positions = torch.arange(seq_len, device=device)
-        # Pairwise |query - key| distances, obtained by broadcasting a column against a row.
-        distance = torch.abs(positions.view(-1, 1) - positions.view(1, -1))
-        within_context = distance <= self.subcodes_context
-        attend = torch.zeros([], device=device, dtype=dtype)
-        blocked = torch.full([], float('-inf'), device=device, dtype=dtype)
-        return torch.where(within_context, attend, blocked)
-    def _stage_attn_mask(self, stage: int, seq_len: int, num_heads: int,
-                         device: torch.device, dtype: torch.dtype) -> tp.Optional[torch.Tensor]:
-        """Build the self-attention mask used for a given stage (codebook index).
-        Args:
-            stage (int): The codebook index.
-            seq_len (int): Token sequence length.
-            num_heads (int): Num transformer attention heads.
-            device (torch.device): device of the output tensor.
-            dtype (torch.dtype): data type of the output tensor.
-        Returns:
-            torch.Tensor: A mask of shape [1, num_heads, L, L], with L being seq_len rounded up to a
-                multiple of 8, or None when attention is unrestricted for this stage.
-        """
-        # The first codebook, or a subcodes context of -1, means unrestricted attention: no mask.
-        if stage <= 0 or self.subcodes_context <= -1:
-            return None
-        # parallel - non-causal - with restricted subcodes context, repeated for each attention head
-        local_mask = self.restricted_context_attn_mask(seq_len, device=device, dtype=dtype)
-        per_head_mask = local_mask.repeat((1, num_heads, 1, 1))
-        # align8 to enable memory efficient attention; the padded region is left at zero
-        align_factor = 8
-        aligned_len = -(-seq_len // align_factor) * align_factor  # ceil to a multiple of align_factor
-        padded_mask = torch.zeros((1, num_heads, aligned_len, aligned_len), device=device, dtype=dtype)
-        padded_mask[..., :seq_len, :seq_len] = per_head_mask
-        return padded_mask
-    def _build_attn_masks(self, compression_model_framerate: int, segment_duration: int, num_heads: int,
-                          device: torch.device, dtype: torch.dtype):
-        """Populate self.attn_mask_per_stage with one entry per stage in range(self.n_q).
-        Each entry is whatever self._stage_attn_mask returns for that stage: either a local
-        attention map or None.
-        Args:
-            compression_model_framerate (int): The frame rate of the tokenizer.
-            segment_duration (int): Sample length in seconds.
-            num_heads (int): Num transformer attention heads.
-            device (torch.device): device of the output tensor.
-            dtype (torch.dtype): data type of the output tensor.
-        """
-        # Number of tokens in one segment: frames per second times seconds.
-        tokens_per_segment = compression_model_framerate * segment_duration
-        stage_masks = []
-        for stage_idx in range(self.n_q):
-            stage_masks.append(self._stage_attn_mask(stage_idx, tokens_per_segment, num_heads, device, dtype))
-        self.attn_mask_per_stage = stage_masks
-    @torch.no_grad()
-    def generate(self,
-                 prompt: tp.Optional[torch.Tensor] = None,
-                 conditions: tp.List[ConditioningAttributes] = [],
-                 num_samples: tp.Optional[int] = None,
-                 max_gen_len: int = 256,
-                 use_sampling: bool = True,
-                 temp: float = 1.0,
-                 top_k: int = 250,
-                 top_p: float = 0.0,
-                 cfg_coef: tp.Optional[float] = None,
-                 cfg_coef_beta: tp.Optional[float] = None,
-                 two_step_cfg: tp.Optional[bool] = None,
-                 remove_prompts: bool = False,
-                 check: bool = False,
-                 callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                 **kwargs) -> torch.Tensor:
-        """Entry point sharing the LMModel.generate signature; delegates to self._generate_magnet.
-        The arguments cfg_coef, two_step_cfg, remove_prompts, check and cfg_coef_beta are not
-        supported and must be left at their defaults, otherwise an AssertionError is raised.
-        Any extra keyword argument is forwarded to self._generate_magnet.
-        """
-        # Reject the arguments MAGNeT does not handle before doing any work.
-        assert cfg_coef is None, "Unsupported in MAGNeT. Use max_cfg_coef,min_cfg_coef instead."
-        assert two_step_cfg is None, "MAGNeT currently doesn't support two step classifier-free-guidance."
-        assert remove_prompts is False, "MAGNeT currently doesn't support the remove_prompts arg."
-        assert check is False, "MAGNeT currently doesn't support the check arg."
-        assert cfg_coef_beta is None, "MAGNeT currently doesn't support the cfg_coef_beta arg."
-        # Only the supported arguments are handed over to the MAGNeT-specific generation method.
-        forwarded = dict(
-            prompt=prompt, conditions=conditions, num_samples=num_samples, max_gen_len=max_gen_len,
-            use_sampling=use_sampling, temp=temp, top_k=top_k, top_p=top_p, callback=callback,
-        )
-        return self._generate_magnet(**forwarded, **kwargs)
-    @torch.no_grad()
-    def _generate_magnet(self,
-                         prompt: tp.Optional[torch.Tensor] = None,
-                         conditions: tp.List[ConditioningAttributes] = [],
-                         num_samples: tp.Optional[int] = None,
-                         max_gen_len: int = 256,
-                         use_sampling: bool = True,
-                         temp: float = 3.0,
-                         top_k: int = 0,
-                         top_p: float = 0.9,
-                         callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                         max_cfg_coef: float = 10.0,
-                         min_cfg_coef: float = 1.0,
-                         decoding_steps: tp.List[int] = [20, 10, 10, 10],
-                         anneal_temp: bool = True,
-                         span_scoring='max',
-                         span_arrangement='nonoverlap') -> torch.Tensor:
-        """Generate audio tokens given textual conditions, and optionally given audio prompts,
-        by running MAGNeT's iterative decoding algorithm for each of the n_q RVQ levels.
-        Args:
-            prompt (torch.Tensor): Prompt tokens of shape [B, K, T].
-            conditions (list of ConditioningAttributes): List of conditions.
-            num_samples (int): Number of samples to generate. When None, it is taken from the
-                prompt batch dimension, else from the number of conditions, else defaults to 1.
-            max_gen_len (int): Maximum generation length.
-            use_sampling (bool): Whether to use a sampling strategy or not.
-            temp (float): Initial sampling temperature.
-            top_k (int): k for "top-k" sampling.
-            top_p (float): p for "top-p" sampling.
-            callback (Callback): Callback function to report generation progress.
-            max_cfg_coef (float): Initial coefficient used for classifier free guidance.
-            min_cfg_coef (float): Final coefficient used for classifier free guidance.
-            decoding_steps (list of n_q ints): The number of iterative decoding steps,
-                                            for each of the n_q RVQ codebooks.
-            anneal_temp (bool): When set to True, softmax temperature will be linearly decayed to zero, at each stage.
-            span_scoring (str): Use the maximum probability of each span ('max')
-                                or the product of probabilities ('prod').
-            span_arrangement (str): Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1').
-                                                in the masking scheme.
-        Returns:
-            torch.Tensor: Generated tokens.
-        Raises:
-            AssertionError: If the model is in training mode or the prompt is not shorter than max_gen_len.
-        """
-        assert not self.training, "generation shouldn't be used in training mode."
-        first_param = next(iter(self.parameters()))
-        device = first_param.device
-        # Resolve the number of samples from the first available source, in priority order:
-        # explicit `num_samples`, then the prompt batch dimension, then the number of conditions.
-        if num_samples is None:
-            if prompt is not None:
-                num_samples = prompt.shape[0]
-            elif conditions:
-                num_samples = len(conditions)
-            else:
-                num_samples = 1
-        # below we create set of conditions: one conditional and one unconditional
-        # to do that we merge the regular condition together with the null condition
-        # we then do 1 forward pass instead of 2.
-        cfg_conditions: tp.Optional[ConditionTensors]
-        if conditions:
-            null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
-            conditions = conditions + null_conditions
-            tokenized = self.condition_provider.tokenize(conditions)
-            cfg_conditions = self.condition_provider(tokenized)
-        else:
-            cfg_conditions = {}
-        if prompt is None:
-            assert num_samples > 0
-            prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
-        B, K, prompt_length = prompt.shape
-        start_offset = prompt_length
-        assert start_offset < max_gen_len
-        mask_id = self.special_token_id
-        # we generate codes with a fixed sequence length
-        shape = (B, K, max_gen_len)
-        gen_codes = torch.full(shape, mask_id, dtype=torch.long, device=device)
-        # filling the gen_codes with the prompt if needed
-        gen_codes[..., :start_offset] = prompt
-        # no pattern interleaving here: the code grid itself is the sequence refined stage by stage
-        gen_sequence = gen_codes
-        curr_step = 0
-        # total reported to the callback; invariant across stages, so computed once
-        total_steps = sum(decoding_steps)
-        for stage, n_steps in zip(range(self.n_q), decoding_steps):
-            gen_sequence, curr_step = self._generate_stage(gen_sequence,
-                                                           cfg_conditions,
-                                                           stage=stage,
-                                                           device=device,
-                                                           prompt_length=prompt_length,
-                                                           prompt=prompt,
-                                                           temp=temp,
-                                                           max_cfg_coef=max_cfg_coef,
-                                                           min_cfg_coef=min_cfg_coef,
-                                                           top_k=top_k,
-                                                           top_p=top_p,
-                                                           timesteps=n_steps,
-                                                           anneal_temp=anneal_temp,
-                                                           span_scoring=span_scoring,
-                                                           use_sampling=use_sampling,
-                                                           span_arrangement=span_arrangement,
-                                                           curr_step=curr_step,
-                                                           total_steps=total_steps,
-                                                           callback=callback)
-        return gen_sequence
-    @torch.no_grad()
-    def _generate_stage(self,
-                        gen_sequence: torch.Tensor,
-                        condition_tensors: tp.Optional[ConditionTensors],
-                        stage: int,
-                        device: torch.device,
-                        prompt_length: int = 0,
-                        prompt: tp.Optional[torch.Tensor] = None,
-                        use_sampling: bool = True,
-                        temp: float = 3.0,
-                        max_cfg_coef: float = 10.0,
-                        min_cfg_coef: float = 1.0,
-                        top_k: int = 0,
-                        top_p: float = 0.0,
-                        timesteps: int = 10,
-                        anneal_temp: bool = True,
-                        span_scoring: str = 'max',
-                        span_arrangement: str = 'nonoverlap',
-                        curr_step: int = 0,
-                        total_steps: int = 0,
-                        callback: tp.Optional[tp.Callable[[int, int], None]] = None) -> tp.Tuple[torch.Tensor, int]:
-        """Generate audio tokens of a single RVQ level (stage), given the previously generated stages,
-           and the textual conditions.
-        Args:
-            gen_sequence (torch.Tensor): Previously generated tokens.
-            condition_tensors (tp.Optional[ConditionTensors]): pre-computed conditioning tensors.
-            stage (int): RVQ level to generate.
-            device (torch.device): device of the output tensor.
-            prompt_length (int): Temporal length of the audio prompt.
-            prompt (torch.Tensor): Prompt tokens of shape [B, K, T].
-            use_sampling (bool): Whether to use a sampling strategy or not.
-            temp (float): Initial sampling temperature.
-            max_clsfg_coef (float): Initial coefficient used for classifier free guidance.
-            min_clsfg_coef (float): Final coefficient used for classifier free guidance.
-            top_k (int): k for "top-k" sampling.
-            top_p (float): p for "top-p" sampling.
-            timesteps (int): Number of iterative decoding steps.
-            anneal_temp (bool): When set to True, softmax temperature will be linearly decayed to zero, at each stage.
-            span_scoring (str): Use the maximum probability of each span ('max')
-                                or the product of probabilities ('prod').
-            span_arrangement (str): Use either non-overlapping spans ('nonoverlap') or overlapping spans ('stride1').
-                                                in the masking scheme.
-            curr_step (int): Global iterative decoding step counter.
-            total_steps (int): Total decoding steps.
-            callback (Callback): Callback function to report generation progress.
-        Returns:
-            tuple(torch.Tensor, int): Generated tokens and the current decoding step counter.
-        """
-        B, K, T = gen_sequence.shape
-        shape = (B, 1, T)  # generating a single codebook per stage
-        mask_id = self.special_token_id
-        stage_gen_seq = torch.full(shape, mask_id, dtype=torch.long, device=device)
-        assert span_arrangement == 'nonoverlap' or span_arrangement == 'stride1'
-        chunk_masking = self.span_len > 1 and span_arrangement == 'nonoverlap'
-        DONT_REMASK_ME_SCORE = -1e4
-        model = self if self._fsdp is None else self._fsdp
-        if chunk_masking:
-            # span-wise scores
-            n_chunks = T // self.span_len
-            if T % self.span_len != 0:
-                # trim sequence ending to achieve a multiple of span_len
-                T = self.span_len * n_chunks
-                gen_sequence = gen_sequence[..., :T]
-                stage_gen_seq = stage_gen_seq[..., :T]
-            chunked_shape = (B, 1, n_chunks)
-            n_prompt_chunks = prompt_length // self.span_len
-            scores = torch.zeros(chunked_shape, dtype=torch.float32, device=device)
-            scores[..., :n_prompt_chunks] = DONT_REMASK_ME_SCORE
-            num_chunks_to_gen = n_chunks - n_prompt_chunks
-        else:
-            # token-wise scores
-            scores = torch.zeros(shape, dtype=torch.float32, device=device)
-            scores[..., :prompt_length] = DONT_REMASK_ME_SCORE
-            gen_T = T - prompt_length
-        # run MAGNeT iterative decoding for "timesteps" iterations
-        for timestep, steps_left in zip(torch.linspace(0, 1, timesteps, device=device), reversed(range(timesteps))):
-            mask_p = torch.cos(timestep * math.pi * 0.5)
-            if chunk_masking:
-                num_masked = max(int((mask_p * num_chunks_to_gen).item()), 1)
-            else:
-                num_masked = max(int((mask_p * gen_T).item()), 1)
-            # masking
-            run_lps_masking = (span_arrangement == 'stride1') and self.span_len > 1
-            if run_lps_masking:
-                # masking of the k least probable overlapping (stride 1) spans
-                mask = torch.concat((
-                    [self._least_probable_span_masking(scores[[i], :, :], num_masked).to(device)
-                     for i in range(B)]), dim=0)
-                stage_gen_seq[mask] = mask_id
-            else:
-                # masking of the k least probable non-overlapping spans
-                masked = scores.topk(num_masked, dim=-1).indices
-                if chunk_masking:
-                    chunks_mask = torch.full(chunked_shape, False, dtype=torch.bool, device=device)
-                    chunks_mask = chunks_mask.scatter(2, masked, True)
-                    mask = torch.repeat_interleave(chunks_mask, self.span_len, dim=-1)
-                    stage_gen_seq[mask] = mask_id
-                else:
-                    stage_gen_seq = stage_gen_seq.scatter(2, masked, mask_id)
-            if prompt is not None:
-                stage_gen_seq[..., :prompt_length] = prompt[:, stage, :].unsqueeze(1)
-            gen_sequence[:, [stage], :] = stage_gen_seq
-            if condition_tensors:
-                # duplicate input for classifier free guidance
-                sequence = torch.cat([gen_sequence, gen_sequence], dim=0)
-            all_logits = model(sequence, [], condition_tensors, stage=stage)
-            if condition_tensors:
-                # classifier free guidance with annealing
-                cond_logits, uncond_logits = all_logits.split(B, dim=0)  # [B, K, T, card]
-                clsfg_coef = float(mask_p) * max_cfg_coef + (1 - float(mask_p)) * min_cfg_coef
-                logits = uncond_logits + (cond_logits - uncond_logits) * clsfg_coef
-            else:
-                logits = all_logits
-            # temperature annealing - linear
-            t = temp * (steps_left / timesteps) if anneal_temp else temp
-            # sampling
-            logits = logits[:, stage, :, :].unsqueeze(1)
-            probs = torch.softmax(logits / max(t, 1e-2), dim=-1)
-            if use_sampling:
-                if top_p > 0.0:
-                    sampled_tokens = utils.sample_top_p(probs, p=top_p)
-                elif top_k > 0:
-                    sampled_tokens = utils.sample_top_k(probs, k=top_k)
-                else:
-                    sampled_tokens = utils.multinomial(probs, num_samples=1)
-            else:
-                sampled_tokens = torch.argmax(logits, dim=-1, keepdim=True)
-            # place mask_id token in each of the masked positions
-            mask = stage_gen_seq == mask_id
-            stage_gen_seq = torch.where(mask, sampled_tokens[..., 0], stage_gen_seq)
-            gen_sequence[:, [stage], :] = stage_gen_seq
-            # get probs of sampled tokens
-            sampled_probs = torch.gather(probs, 3, sampled_tokens)[..., 0]
-            # span scoring
-            if chunk_masking:
-                if span_scoring == 'max':
-                    # max in linear space
-                    scores = 1 - torch.max(sampled_probs.reshape((B, 1, n_chunks, -1)), dim=-1)[0]
-                elif span_scoring == 'prod':
-                    # prod in log space
-                    scores = torch.sum(-torch.log(sampled_probs).reshape((B, 1, n_chunks, -1)), dim=-1)
-                else:
-                    raise NotImplementedError
-            else:
-                # prod in log space for lps masking (stride1)
-                scores = -torch.log(sampled_probs)
-            # Fix unmasked tokens by placing inf probs (-inf scores)
-            if chunk_masking:
-                scores = scores.masked_fill(~chunks_mask, DONT_REMASK_ME_SCORE)
-            else:
-                scores = scores.masked_fill(~mask, DONT_REMASK_ME_SCORE)
-            if callback is not None:
-                curr_step += 1
-                callback(curr_step, total_steps)
-        return gen_sequence, curr_step
-    def _construct_spans_mask(self, span_starts: torch.Tensor, T: int, device: torch.device) -> torch.Tensor:
-        """Build a [1x1xT] boolean mask made of (possibly overlapping) spans of True values.
-           Each entry selected by span_starts is the first position of a span; every span
-           is self.span_len positions long and is cut off at the end of the sequence.
-        Args:
-            span_starts (torch.Tensor): Index into the time axis selecting the span starts.
-                The caller visible in this file passes a slice of an argsort result,
-                i.e. integer positions rather than a boolean mask.
-            T (int): Sequence length.
-            device (torch.device): device of the output tensor.
-        Returns:
-            torch.Tensor: Spans mask of shape [1x1xT]
-        """
-        # Start with only the span-start positions set.
-        mask = torch.full((1, 1, T), False, device=device)
-        mask[:, :, span_starts] = True
-        shifted_mask = mask.clone()
-        # Each iteration shifts the start markers one step to the right (prepend False,
-        # drop the last position) and ORs them in, growing every span by one position.
-        # After span_len - 1 shifts each start covers span_len consecutive positions.
-        for _ in range(self.span_len - 1):
-            shifted_mask = torch.concat((torch.full((1, 1, 1), False, device=device), shifted_mask[:, :, :-1]), dim=-1)
-            mask = torch.logical_or(mask, shifted_mask)
-        return mask
-    def _least_probable_span_masking(self, scores: torch.Tensor, num_masked_trg: int) -> torch.Tensor:
-        """Construct a [1x1xT] boolean mask made of the u least probable spans,
-           where the token probability is determined by -scores, and the total
-           number of masked tokens is as close as possible to num_masked_trg.
-           u is found by binary search.
-        Args:
-            scores (torch.Tensor): Per token score [-log(prob)]. Only element [0, 0] of the
-                first two dims is ranked; the visible caller passes one batch item at a time.
-            num_masked_trg (int): The desired amount of tokens to be masked. Values below
-                self.span_len are raised to self.span_len.
-        Returns:
-            torch.Tensor: Spans mask of shape [1x1xT]
-        """
-        T = scores.shape[-1]
-        device = scores.device
-        # Sliding windows of length span_len with stride 1 along the time axis.
-        scores_unfolded = scores.unfold(2, self.span_len, 1)
-        # Span score is the product of probs (sum in log space)
-        span_scores = scores_unfolded.sum(dim=-1)
-        # Highest summed score first, i.e. least probable spans first.
-        spans_by_scores = torch.argsort(span_scores[0, 0], descending=True)
-        num_masked_trg = max(num_masked_trg, self.span_len)
-        # Binary search for u - the number least probable overlapping masked spans s.t.
-        # the total masking rate is the closest to num_masked_trg / T.
-        # min_u: spans needed if none overlap; max_u: spans needed if each new span
-        # adds a single token to the previous one.
-        min_u = num_masked_trg // self.span_len
-        max_u = num_masked_trg - self.span_len + 1
-        mid = round(0.5 * (min_u + max_u))
-        # The search interval is already degenerate: use mid directly.
-        if mid == min_u or mid == max_u:
-            return self._construct_spans_mask(spans_by_scores[:mid], T, device)
-        # The loop body runs at least once here, so `mask` is always bound on exit.
-        # The returned mask is the one built inside the last iteration, i.e. for the
-        # value of `mid` before its final update.
-        while mid > min_u and mid < max_u:
-            mask = self._construct_spans_mask(spans_by_scores[:mid], T, device)
-            n_masked = mask.sum()
-            if n_masked > num_masked_trg:
-                max_u = mid
-                mid = round(0.5 * (min_u + max_u))
-            else:
-                min_u = mid
-                mid = round(0.5 * (min_u + max_u))
-        return mask

audiocraft/models/loaders.py DELETED Viewed

@@ -1,291 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Utility functions to load from the checkpoints.
-Each checkpoint is a torch.saved dict with the following keys:
-- 'xp.cfg': the hydra config as dumped during training. This should be used
-    to rebuild the object using the audiocraft.models.builders functions,
-- 'best_state': a readily loadable best state for the model, including
-    the conditioner. The model obtained from `xp.cfg` should be compatible
-    with this state dict. In the case of a LM, the encodec model would not be
-    bundled along but instead provided separately.
-Those functions also support loading from a remote location with the Torch Hub API.
-They also support overriding some parameters, in particular the device and dtype
-of the returned model.
-"""
-from pathlib import Path
-from huggingface_hub import hf_hub_download
-import typing as tp
-import os
-from omegaconf import OmegaConf, DictConfig
-import torch
-import audiocraft
-from . import builders
-from .encodec import CompressionModel
-def get_audiocraft_cache_dir() -> tp.Optional[str]:
-    """Return the value of the AUDIOCRAFT_CACHE_DIR environment variable, or None if it is unset."""
-    return os.environ.get('AUDIOCRAFT_CACHE_DIR', None)
-# Short model names mapped to the Hugging Face Hub repo id that _get_state_dict
-# downloads from when given one of these keys.
-HF_MODEL_CHECKPOINTS_MAP = {
-    "small": "facebook/musicgen-small",
-    "medium": "facebook/musicgen-medium",
-    "large": "facebook/musicgen-large",
-    "melody": "facebook/musicgen-melody",
-    "melody-large": "facebook/musicgen-melody-large",
-    "stereo-small": "facebook/musicgen-stereo-small",
-    "stereo-medium": "facebook/musicgen-stereo-medium",
-    "stereo-large": "facebook/musicgen-stereo-large",
-    "stereo-melody": "facebook/musicgen-stereo-melody",
-    "stereo-melody-large": "facebook/musicgen-stereo-melody-large",
-    "style": "facebook/musicgen-style",
-}
-def _get_state_dict(
-    file_or_url_or_id: tp.Union[Path, str],
-    filename: tp.Optional[str] = None,
-    device='cpu',
-    cache_dir: tp.Optional[str] = None,
-):
-    """Load a checkpoint from a local path, an https URL or the Hugging Face Hub.
-    The source is resolved in this order: an existing file, an existing directory
-    (loads `filename` inside it), a string starting with 'https://', a key of
-    HF_MODEL_CHECKPOINTS_MAP, and finally a Hugging Face repo id.
-    Args:
-        file_or_url_or_id: File path, directory path, https URL, short model name or HF repo id.
-        filename: File to load inside the directory or HF repo. Required for the two HF branches.
-        device: Passed as `map_location` when loading.
-        cache_dir: HF download cache directory; defaults to get_audiocraft_cache_dir().
-    Returns:
-        The object returned by torch.load / torch.hub.load_state_dict_from_url.
-    """
-    if cache_dir is None:
-        cache_dir = get_audiocraft_cache_dir()
-    # Return the state dict either from a file or url
-    file_or_url_or_id = str(file_or_url_or_id)
-    assert isinstance(file_or_url_or_id, str)
-    # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
-    if os.path.isfile(file_or_url_or_id):
-        return torch.load(file_or_url_or_id, map_location=device)
-    if os.path.isdir(file_or_url_or_id):
-        # NOTE(review): unlike the HF branches, filename is not checked here; with
-        # filename=None the path becomes "<dir>/None".
-        file = f"{file_or_url_or_id}/{filename}"
-        return torch.load(file, map_location=device)
-    elif file_or_url_or_id.startswith('https://'):
-        return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
-    elif file_or_url_or_id in HF_MODEL_CHECKPOINTS_MAP:
-        # Short name: translate to the mapped repo id before downloading.
-        assert filename is not None, "filename needs to be defined if using HF checkpoints"
-        repo_id = HF_MODEL_CHECKPOINTS_MAP[file_or_url_or_id]
-        file = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
-        return torch.load(file, map_location=device)
-    else:
-        # Anything else is treated as a Hugging Face repo id.
-        assert filename is not None, "filename needs to be defined if using HF checkpoints"
-        file = hf_hub_download(
-            repo_id=file_or_url_or_id, filename=filename, cache_dir=cache_dir,
-            library_name="audiocraft", library_version=audiocraft.__version__)
-        return torch.load(file, map_location=device)
-def create_melody_config(model_id: str, device: str) -> DictConfig:
-    """Create a fallback configuration for melody models.
-    The values match create_default_config except for codec.codebook_size (2048 here).
-    Args:
-        model_id: The model identifier; channel counts are 2 when it contains "stereo", else 1
-        device: The device to use (stored as a string)
-    Returns:
-        A compatible OmegaConf DictConfig
-    """
-    base_cfg = {
-        "device": str(device),
-        "channels": 2 if "stereo" in model_id else 1,
-        "sample_rate": 32000,
-        "audio_channels": 2 if "stereo" in model_id else 1,
-        "frame_rate": 50,
-        "codec_name": "encodec",
-        "codec": {
-            "dim": 128,
-            "hidden_dim": 1024,
-            "stride": 320,
-            "n_q": 4,
-            "codebook_size": 2048,
-            "normalize": True,
-        }
-    }
-    return OmegaConf.create(base_cfg)
-def create_default_config(model_id: str, device: str) -> DictConfig:
-    """Create a fallback configuration for standard models.
-    The values match create_melody_config except for codec.codebook_size (1024 here).
-    Args:
-        model_id: The model identifier; channel counts are 2 when it contains "stereo", else 1
-        device: The device to use (stored as a string)
-    Returns:
-        A compatible OmegaConf DictConfig
-    """
-    base_cfg = {
-        "device": str(device),
-        "channels": 2 if "stereo" in model_id else 1,
-        "sample_rate": 32000,
-        "audio_channels": 2 if "stereo" in model_id else 1,
-        "frame_rate": 50,
-        "codec_name": "encodec",
-        "codec": {
-            "dim": 128,
-            "hidden_dim": 1024,
-            "stride": 320,
-            "n_q": 4,
-            "codebook_size": 1024,
-            "normalize": True,
-        }
-    }
-    return OmegaConf.create(base_cfg)
-def load_compression_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_dir: tp.Optional[str] = None):
-    """Load the compression model checkpoint, stored as "compression_state_dict.bin" (on the default 'cpu' device)."""
-    return _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
-def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
-    """Build a compression model from a checkpoint and return it in eval mode.
-    If the checkpoint has a 'pretrained' entry, the model is obtained through
-    CompressionModel.get_pretrained instead. Otherwise the config comes from
-    'xp.cfg', or from a hard-coded fallback for a fixed list of model names.
-    Args:
-        file_or_url_or_id: Checkpoint location, see _get_state_dict.
-        device: Device the model is built for.
-        cache_dir: Optional download cache directory.
-    Raises:
-        KeyError: if the checkpoint has no 'xp.cfg' and the name has no fallback configuration.
-    """
-    pkg = load_compression_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    if 'pretrained' in pkg:
-        return CompressionModel.get_pretrained(pkg['pretrained'], device=device)
-    # Handle newer model formats that might not have xp.cfg
-    if 'xp.cfg' not in pkg:
-        # NOTE(review): this list holds short names, but MusicGen.get_pretrained passes the
-        # mapped repo id ("facebook/musicgen-..."), which would not match - confirm intent.
-        if file_or_url_or_id in ['melody-large', 'stereo-melody', 'stereo-medium',
-                                 'stereo-small', 'stereo-large', 'stereo-melody-large','style']:
-            print(f"Using fallback configuration for {file_or_url_or_id}")
-            # Create a default configuration based on the model type
-            # This is where you'd need to add model-specific configurations
-            if 'melody' in file_or_url_or_id:
-                cfg = create_melody_config(file_or_url_or_id, device)
-            else:
-                cfg = create_default_config(file_or_url_or_id, device)
-        else:
-            raise KeyError(f"Missing configuration for model {file_or_url_or_id}")
-    else:
-        cfg = OmegaConf.create(pkg['xp.cfg'])
-    # The requested device always overrides whatever the config carried.
-    cfg.device = str(device)
-    model = builders.get_compression_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    return model
-def load_lm_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_dir: tp.Optional[str] = None):
-    """Load the language model checkpoint, stored as "state_dict.bin" (on the default 'cpu' device)."""
-    return _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
-def _delete_param(cfg: DictConfig, full_name: str):
-    """Remove the entry named by a dotted path from cfg, if it exists.
-    Args:
-        cfg: Config to edit in place.
-        full_name: Dot-separated key path, e.g. 'conditioners.args.drop_desc_p'.
-    """
-    parts = full_name.split('.')
-    # Walk down to the parent node; a missing intermediate key means nothing to delete.
-    for part in parts[:-1]:
-        if part in cfg:
-            cfg = cfg[part]
-        else:
-            return
-    # Struct mode is switched off around the deletion and switched on afterwards,
-    # whatever its previous state was.
-    OmegaConf.set_struct(cfg, False)
-    if parts[-1] in cfg:
-        del cfg[parts[-1]]
-    OmegaConf.set_struct(cfg, True)
-def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
-    """Build the language model from a checkpoint and return it in eval mode.
-    The config used to build the model is attached to the result as `model.cfg`.
-    Args:
-        file_or_url_or_id: Checkpoint location, see _get_state_dict.
-        device: Device the model is built for; also selects the dtype
-            (float32 on 'cpu', float16 otherwise).
-        cache_dir: Optional download cache directory.
-    """
-    pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        # On CPU the transformer_lm memory_efficient flag is turned off and custom is turned on.
-        cfg.transformer_lm.memory_efficient = False
-        cfg.transformer_lm.custom = True
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    # Drop these entries from the config, when present, before building the model.
-    _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
-    _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
-    _delete_param(cfg, 'conditioners.args.drop_desc_p')
-    model = builders.get_lm_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-def load_lm_model_magnet(file_or_url_or_id: tp.Union[Path, str], compression_model_frame_rate: int,
-                         device='cpu', cache_dir: tp.Optional[str] = None):
-    """Build the MAGNeT language model from a checkpoint and return it in eval mode.
-    The config used to build the model is attached to the result as `model.cfg`.
-    Args:
-        file_or_url_or_id: Checkpoint location, see _get_state_dict.
-        compression_model_frame_rate: Frame rate of the paired compression model,
-            copied into cfg.transformer_lm.compression_model_framerate.
-        device: Device the model is built for; also selects the dtype
-            (float32 on 'cpu', float16 otherwise).
-        cache_dir: Optional download cache directory.
-    """
-    pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
-    _delete_param(cfg, 'conditioners.args.drop_desc_p')
-    # Copy values from other parts of the config into the transformer_lm section.
-    cfg.transformer_lm.compression_model_framerate = compression_model_frame_rate
-    cfg.transformer_lm.segment_duration = cfg.dataset.segment_duration
-    cfg.transformer_lm.span_len = cfg.masking.span_len
-    # MAGNeT models v1 support only xformers backend.
-    from audiocraft.modules.transformer import set_efficient_attention_backend
-    if cfg.transformer_lm.memory_efficient:
-        set_efficient_attention_backend("xformers")
-    model = builders.get_lm_model(cfg)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-def load_jasco_model(file_or_url_or_id: tp.Union[Path, str],
-                     compression_model: CompressionModel,
-                     device='cpu', cache_dir: tp.Optional[str] = None):
-    """Build a JASCO model from a checkpoint and return it in eval mode.
-    The config used to build the model is attached to the result as `model.cfg`.
-    Args:
-        file_or_url_or_id: Checkpoint location, see _get_state_dict.
-        compression_model: Compression model handed to builders.get_jasco_model.
-        device: Device the model is built for; also selects the dtype
-            (float32 on 'cpu', float16 otherwise).
-        cache_dir: Optional download cache directory.
-    """
-    pkg = load_lm_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
-    cfg = OmegaConf.create(pkg['xp.cfg'])
-    cfg.device = str(device)
-    if cfg.device == 'cpu':
-        cfg.dtype = 'float32'
-    else:
-        cfg.dtype = 'float16'
-    model = builders.get_jasco_model(cfg, compression_model)
-    model.load_state_dict(pkg['best_state'])
-    model.eval()
-    model.cfg = cfg
-    return model
-def load_mbd_ckpt(file_or_url_or_id: tp.Union[Path, str],
-                  filename: tp.Optional[str] = None,
-                  cache_dir: tp.Optional[str] = None):
-    """Load the checkpoint used by load_diffusion_models; a thin wrapper around _get_state_dict."""
-    return _get_state_dict(file_or_url_or_id, filename=filename, cache_dir=cache_dir)
-def load_diffusion_models(file_or_url_or_id: tp.Union[Path, str],
-                          device='cpu',
-                          filename: tp.Optional[str] = None,
-                          cache_dir: tp.Optional[str] = None):
-    """Load one diffusion model and one processor per band from a checkpoint.
-    The checkpoint is read as a mapping with 'sample_rate', 'n_bands' and, for each
-    band index i, an entry holding 'cfg', 'model_state' and 'processor_state'.
-    Args:
-        file_or_url_or_id: Checkpoint location, see _get_state_dict.
-        device: Device the models and processors are moved to.
-        filename: File to load inside a directory or HF repo.
-        cache_dir: Optional download cache directory.
-    Returns:
-        A tuple (models, processors, cfgs) of lists with one entry per band, in band order.
-    """
-    pkg = load_mbd_ckpt(file_or_url_or_id, filename=filename, cache_dir=cache_dir)
-    models = []
-    processors = []
-    cfgs = []
-    sample_rate = pkg['sample_rate']
-    for i in range(pkg['n_bands']):
-        cfg = pkg[i]['cfg']
-        model = builders.get_diffusion_model(cfg)
-        model_dict = pkg[i]['model_state']
-        model.load_state_dict(model_dict)
-        model.to(device)
-        processor = builders.get_processor(cfg=cfg.processor, sample_rate=sample_rate)
-        processor_dict = pkg[i]['processor_state']
-        processor.load_state_dict(processor_dict)
-        processor.to(device)
-        models.append(model)
-        processors.append(processor)
-        cfgs.append(cfg)
-    return models, processors, cfgs

audiocraft/models/magnet.py DELETED Viewed

@@ -1,88 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Main model for using MAGNeT. This will combine all the required components
-and provide easy access to the generation API.
-"""
-import typing as tp
-import torch
-from .genmodel import BaseGenModel
-from .loaders import load_compression_model, load_lm_model_magnet
-class MAGNeT(BaseGenModel):
-    """MAGNeT main model with convenient generation API.
-    Args:
-       See MusicGen class.
-    """
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-        # MAGNeT operates over a fixed sequence length defined in its config.
-        self.duration = self.lm.cfg.dataset.segment_duration
-        # Install the default generation parameters.
-        self.set_generation_params()
-    @staticmethod
-    def get_pretrained(name: str = 'facebook/magnet-small-10secs', device=None):
-        """Return pretrained model, we provide six models:
-        - facebook/magnet-small-10secs (300M), text to music, 10-second audio samples.
-          # see: https://huggingface.co/facebook/magnet-small-10secs
-        - facebook/magnet-medium-10secs (1.5B), text to music, 10-second audio samples.
-          # see: https://huggingface.co/facebook/magnet-medium-10secs
-        - facebook/magnet-small-30secs (300M), text to music, 30-second audio samples.
-          # see: https://huggingface.co/facebook/magnet-small-30secs
-        - facebook/magnet-medium-30secs (1.5B), text to music, 30-second audio samples.
-          # see: https://huggingface.co/facebook/magnet-medium-30secs
-        - facebook/audio-magnet-small (300M), text to sound-effect (10-second samples).
-          # see: https://huggingface.co/facebook/audio-magnet-small
-        - facebook/audio-magnet-medium (1.5B), text to sound-effect (10-second samples).
-          # see: https://huggingface.co/facebook/audio-magnet-medium
-        Args:
-            name (str): Checkpoint identifier passed to the loaders.
-            device: Target device; when None, 'cuda' is used if torch reports at least
-                one CUDA device, otherwise 'cpu'.
-        """
-        if device is None:
-            if torch.cuda.device_count():
-                device = 'cuda'
-            else:
-                device = 'cpu'
-        compression_model = load_compression_model(name, device=device)
-        # The LM is built with the frame rate of the compression model it is paired with.
-        lm = load_lm_model_magnet(name, compression_model_frame_rate=int(compression_model.frame_rate), device=device)
-        if 'self_wav' in lm.condition_provider.conditioners:
-            lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
-        kwargs = {'name': name, 'compression_model': compression_model, 'lm': lm}
-        return MAGNeT(**kwargs)
-    # NOTE: decoding_steps has a mutable list default; it is only read and copied
-    # below, never mutated, so the shared default is not modified between calls.
-    def set_generation_params(self, use_sampling: bool = True, top_k: int = 0,
-                              top_p: float = 0.9, temperature: float = 3.0,
-                              max_cfg_coef: float = 10.0, min_cfg_coef: float = 1.0,
-                              decoding_steps: tp.List[int] = [20, 10, 10, 10],
-                              span_arrangement: str = 'nonoverlap'):
-        """Set the generation parameters for MAGNeT.
-        Args:
-            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
-            top_k (int, optional): top_k used for sampling. Defaults to 0.
-            top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.9.
-            temperature (float, optional): Initial softmax temperature parameter. Defaults to 3.0.
-            max_cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 10.0.
-            min_cfg_coef (float, optional): End coefficient of classifier free guidance annealing. Defaults to 1.0.
-            decoding_steps (list of n_q ints, optional): The number of iterative decoding steps,
-                                                         for each of the n_q RVQ codebooks.
-            span_arrangement (str, optional): Use either non-overlapping spans ('nonoverlap')
-                                              or overlapping spans ('stride1') in the masking scheme.
-        """
-        # `temperature` is stored under the key 'temp'; decoding_steps entries are cast to int.
-        self.generation_params = {
-            'use_sampling': use_sampling,
-            'temp': temperature,
-            'top_k': top_k,
-            'top_p': top_p,
-            'max_cfg_coef': max_cfg_coef,
-            'min_cfg_coef': min_cfg_coef,
-            'decoding_steps': [int(s) for s in decoding_steps],
-            'span_arrangement': span_arrangement
-        }

audiocraft/models/musicgen.py DELETED Viewed

@@ -1,566 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Main model for using MusicGen. This will combine all the required components
-and provide easy access to the generation API.
-"""
-import os
-import typing as tp
-import warnings
-import omegaconf
-import torch
-import gradio as gr
-from .encodec import CompressionModel
-from .genmodel import BaseGenModel
-from .lm import LMModel
-from .builders import get_debug_compression_model, get_debug_lm_model, get_wrapped_compression_model
-from .loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
-from ..data.audio_utils import convert_audio
-from ..modules.conditioners import ConditioningAttributes, WavCondition, StyleConditioner
-from ..utils.autocast import TorchAutocast
-# Type aliases: a list of optional tensors, and "a single tensor or such a list".
-# Presumably used for melody conditioning inputs - the usage is outside this view.
-MelodyList = tp.List[tp.Optional[torch.Tensor]]
-MelodyType = tp.Union[torch.Tensor, MelodyList]
-class MusicGen:
-    """MusicGen main model with convenient generation API.
-    Args:
-        name (str): name of the model.
-        compression_model (CompressionModel): Compression model
-            used to map audio to invertible discrete representations.
-        lm (LMModel): Language model over discrete representations.
-        max_duration (float, optional): maximum duration the model can produce,
-            otherwise, inferred from the training params.
-    """
-    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel, max_duration: tp.Optional[float] = 30):
-        # Wires the two models together, resolves the maximum duration, installs the
-        # default generation parameters and picks an autocast policy from the LM's device.
-        self.name = name
-        self.compression_model = compression_model
-        self.lm = lm
-        self.cfg: tp.Optional[omegaconf.DictConfig] = None
-        # Just to be safe, let's put everything in eval mode.
-        self.compression_model.eval()
-        self.lm.eval()
-        # The loaders attach the build config to the LM as `lm.cfg`; reuse it when present.
-        if hasattr(lm, 'cfg'):
-            cfg = lm.cfg
-            assert isinstance(cfg, omegaconf.DictConfig)
-            self.cfg = cfg
-        if self.cfg is not None:
-            self.compression_model = get_wrapped_compression_model(self.compression_model, self.cfg)
-        # max_duration defaults to 30, so this branch only runs when None is passed explicitly.
-        if max_duration is None:
-            if self.cfg is not None:
-                max_duration = lm.cfg.dataset.segment_duration  # type: ignore
-            else:
-                raise ValueError("You must provide max_duration when building directly MusicGen")
-        assert max_duration is not None
-        self.max_duration = max_duration
-        self.duration = 15.0  # default duration
-        # The device is taken from the first LM parameter.
-        self.device = next(iter(lm.parameters())).device
-        self.generation_params: dict = {}
-        self.set_generation_params(duration=self.duration)  # 15 seconds by default
-        self._progress_callback: tp.Union[tp.Callable[[int, int], None], gr.Progress] = None
-        # Autocast to float16 on any non-CPU device; disabled on CPU.
-        if self.device.type == 'cpu':
-            self.autocast = TorchAutocast(enabled=False)
-        else:
-            self.autocast = TorchAutocast(
-                enabled=True, device_type=self.device.type, dtype=torch.float16)
-    @property
-    def version(self) -> str:
-        """Version string of the audiocraft package (imported lazily)."""
-        from audiocraft import __version__ as audiocraft_version
-        return audiocraft_version
-    @property
-    def frame_rate(self) -> float:
-        """Roughly the number of AR steps per seconds (read from the compression model)."""
-        return self.compression_model.frame_rate
-    @property
-    def sample_rate(self) -> int:
-        """Sample rate of the generated audio (read from the compression model)."""
-        return self.compression_model.sample_rate
-    @property
-    def audio_channels(self) -> int:
-        """Audio channels of the generated audio (the compression model's `channels`)."""
-        return self.compression_model.channels
-    @staticmethod
-    def get_pretrained(name: str = 'melody-large', device=None):
-        """Return pretrained model, we provide ten models:
-        - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
-        - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
-        - melody (1.5B) text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-melody
-        - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
-        - melody-large (3.3B), text to music, and text+melody to music # see: https://huggingface.co/facebook/musicgen-melody-large
-        - stereo-small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
-        - stereo-medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-stereo-medium
-        - stereo-melody (1.5B) text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-stereo-melody
-        - stereo-large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-stereo-large
-        - stereo-melody-large (3.3B), text to music, and text+melody to music # see: https://huggingface.co/facebook/musicgen-stereo-melody-large
-        - musicgen-style (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-style
-        """
-        if device is None:
-            if torch.cuda.device_count():
-                device = 'cuda'
-            else:
-                device = 'cpu'
-        if name == 'debug':
-            # used only for unit tests
-            compression_model = get_debug_compression_model(device)
-            lm = get_debug_lm_model(device)
-            return MusicGen(name, compression_model, lm, max_duration=30)
-        if name not in HF_MODEL_CHECKPOINTS_MAP:
-            if not os.path.isfile(name) and not os.path.isdir(name):
-                raise ValueError(
-                    f"{name} is not a valid checkpoint name. "
-                    f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
-                )
-        else:
-            name = HF_MODEL_CHECKPOINTS_MAP[name]
-        cache_dir = os.environ.get('MUSICGEN_ROOT', None)
-        compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
-        lm = load_lm_model(name, device=device, cache_dir=cache_dir)
-        if name.__contains__('melody') or 'self_wav' in lm.condition_provider.conditioners:
-            lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
-            lm.condition_provider.conditioners['self_wav']._use_masking = False
-        return MusicGen(name, compression_model, lm)
-    def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
-                              top_p: float = 0.0, temperature: float = 1.0,
-                              duration: float = 30.0, cfg_coef: float = 3.0,
-                              cfg_coef_beta: tp.Optional[float] = None,
-                              two_step_cfg: bool = False, extend_stride: float = 10, rep_penalty: float = None):
-        """Set the generation parameters for MusicGen.
-        Args:
-            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
-            top_k (int, optional): top_k used for sampling. Defaults to 250.
-            top_p (float, optional): top_p used for sampling, when set to 0 top_k is used. Defaults to 0.0.
-            temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
-            duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
-                Stored on self.duration rather than in generation_params.
-            cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
-            cfg_coef_beta (float, optional): beta coefficient in double classifier free guidance.
-                Should be only used for MusicGen melody if we want to push the text condition more than
-                the audio conditioning. See paragraph 4.3 in https://arxiv.org/pdf/2407.12563 to understand
-                double CFG.
-            two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
-                instead of batching together the two. This has some impact on how things
-                are padded but seems to have little impact in practice.
-            extend_stride: when doing extended generation (i.e. more than 30 seconds), by how much
-                should we extend the audio each time. Larger values will mean less context is
-                preserved, and shorter value will require extra computations.
-                Must be smaller than self.max_duration.
-            rep_penalty (float, optional): If set, use repetition penalty during generation. Not Implemented.
-                The value is accepted but not used by this method.
-        """
-        # NOTE: this check is an assert, so it is skipped when Python runs with -O.
-        assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
-        self.extend_stride = extend_stride
-        self.duration = duration
-        # `temperature` is stored under the key 'temp'.
-        self.generation_params = {
-            #'max_gen_len': int(duration * self.frame_rate),
-            'use_sampling': use_sampling,
-            'temp': temperature,
-            'top_k': top_k,
-            'top_p': top_p,
-            'cfg_coef': cfg_coef,
-            'two_step_cfg': two_step_cfg,
-            'cfg_coef_beta': cfg_coef_beta,
-        }
-    def set_style_conditioner_params(self, eval_q: int = 3, excerpt_length: float = 3.0,
-                                     ds_factor: tp.Optional[int] = None,
-                                     encodec_n_q: tp.Optional[int] = None) -> None:
-        """Set the parameters of the style conditioner.
-        The LM's `self_wav` conditioner must be a StyleConditioner; the values are
-        forwarded unchanged to its set_params method.
-        Args:
-            eval_q (int): the number of residual quantization streams used to quantize the style condition
-                the smaller it is, the narrower is the information bottleneck
-            excerpt_length (float): the excerpt length in seconds that is extracted from the audio
-                conditioning
-            ds_factor: (int): the downsampling factor used to downsample the style tokens before
-                using them as a prefix
-            encodec_n_q: (int, optional): if encodec is used as a feature extractor, sets the number
-                of streams that is used to extract features
-        """
-        # NOTE: this check is an assert, so it is skipped when Python runs with -O.
-        assert isinstance(self.lm.condition_provider.conditioners.self_wav, StyleConditioner), \
-            "Only use this function if you model is MusicGen-Style"
-        self.lm.condition_provider.conditioners.self_wav.set_params(eval_q=eval_q,
-                                                                    excerpt_length=excerpt_length,
-                                                                    ds_factor=ds_factor,
-                                                                    encodec_n_q=encodec_n_q)
-    def set_custom_progress_callback(
-            self,
-            progress_callback: tp.Optional[tp.Union[tp.Callable[[float, str], None], gr.Progress]] = None):
-        """Install (or clear) the progress callback stored on the model.
-        Args:
-            progress_callback (callable or gr.Progress, optional): Hook invoked by
-                `_generate_tokens` as `progress_callback(fraction, message)`, where `fraction`
-                is the generated share of the requested duration and `message` is a
-                human-readable status string. Pass None to disable this hook.
-        """
-        self._progress_callback = progress_callback
-    def generate_unconditional(self, num_samples: int, progress: bool = False,
-                               return_tokens: bool = False,
-                               progress_callback: tp.Optional[gr.Progress] = None) -> tp.Union[torch.Tensor,
-                                                                        tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples in an unconditional manner.
-        Args:
-            num_samples (int): Number of samples to be generated.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-            progress_callback (gr.Progress, optional): Per-call progress hook, forwarded to
-                `_generate_tokens`. Defaults to None.
-        Returns:
-            The output of `generate_audio`, or the tuple `(audio, tokens)` when `return_tokens` is True.
-        """
-        descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
-        # Forward the per-call callback; it used to be accepted and then silently dropped.
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress,
-                                       progress_callback=progress_callback)
-        audio = self.generate_audio(tokens)
-        return (audio, tokens) if return_tokens else audio
-    def generate(self, descriptions: tp.List[str], progress: bool = False, return_tokens: bool = False,
-                 progress_callback: tp.Optional[gr.Progress] = None) \
-            -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples conditioned on text.
-        Args:
-            descriptions (list of str): A list of strings used as text conditioning.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-            progress_callback (gr.Progress, optional): Per-call progress hook, forwarded to
-                `_generate_tokens`. Defaults to None.
-        Returns:
-            The output of `generate_audio`, or the tuple `(audio, tokens)` when `return_tokens` is True.
-        """
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
-        assert prompt_tokens is None
-        # Forward the per-call callback; it used to be accepted and then silently dropped.
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress,
-                                       progress_callback=progress_callback)
-        audio = self.generate_audio(tokens)
-        return (audio, tokens) if return_tokens else audio
-    def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyType,
-                             melody_sample_rate: int, progress: bool = False,
-                             return_tokens: bool = False, progress_callback=gr.Progress(track_tqdm=True)) -> tp.Union[torch.Tensor,
-                                                                      tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples conditioned on text and melody.
-        Args:
-            descriptions (list of str): A list of strings used as text conditioning.
-            melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
-                melody conditioning. Should have shape [B, C, T] with B matching the description length,
-                C=1 or 2. It can be [C, T] if there is a single description. It can also be
-                a list of [C, T] tensors.
-            melody_sample_rate: (int): Sample rate of the melody waveforms.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-            progress_callback (gr.Progress, optional): Per-call progress hook, forwarded to
-                `_generate_tokens`. The default instance is created once, when the method is defined.
-        Returns:
-            The output of `generate_audio`, or the tuple `(audio, tokens)` when `return_tokens` is True.
-        Raises:
-            ValueError: If `melody_wavs` is a tensor that is neither 2-D nor 3-D.
-        """
-        if isinstance(melody_wavs, torch.Tensor):
-            if melody_wavs.dim() == 2:
-                # Single [C, T] melody: add the batch axis.
-                melody_wavs = melody_wavs[None]
-            if melody_wavs.dim() != 3:
-                raise ValueError("Melody wavs should have a shape [B, C, T].")
-            melody_wavs = list(melody_wavs)
-        else:
-            for melody in melody_wavs:
-                if melody is not None:
-                    assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
-        # Resample / remix every provided melody to the model's own format; None entries stay None.
-        melody_wavs = [
-            convert_audio(wav, melody_sample_rate, self.sample_rate, self.audio_channels)
-            if wav is not None else None
-            for wav in melody_wavs]
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
-                                                                        melody_wavs=melody_wavs)
-        assert prompt_tokens is None
-        # Forward the per-call callback; it used to be accepted and then silently dropped.
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress,
-                                       progress_callback=progress_callback)
-        audio = self.generate_audio(tokens)
-        return (audio, tokens) if return_tokens else audio
-    def generate_with_all(self, descriptions: tp.List[str], melody_wavs: MelodyType,
-                             sample_rate: int, progress: bool = False, prompt: tp.Optional[torch.Tensor] = None, return_tokens: bool = False, progress_callback: tp.Optional[gr.Progress] = None) \
-            -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples conditioned on text and melody and audio prompts.
-        Args:
-            descriptions (tp.List[str]): A list of strings used as text conditioning. When None,
-                one empty description is used per prompt (or per melody if there is no prompt).
-            melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
-                melody conditioning. Should have shape [B, C, T] with B matching the description length,
-                C=1 or 2. It can be [C, T] if there is a single description. It can also be
-                a list of [C, T] tensors, or None for no melody conditioning.
-            sample_rate: (int): Sample rate of the melody waveforms. The same rate is also
-                used to convert `prompt`.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            prompt (torch.Tensor): A batch of waveforms used for continuation.
-                Prompt should be [B, C, T], or [C, T] if only one sample is generated.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-            progress_callback (gr.Progress, optional): Per-call progress hook, forwarded to
-                `_generate_tokens`. Defaults to None.
-        Returns:
-            The output of `generate_audio`, or the tuple `(audio, tokens)` when `return_tokens` is True.
-        Raises:
-            ValueError: If a tensor argument has the wrong number of dimensions, or if
-                `descriptions`, `melody_wavs` and `prompt` are all None.
-        """
-        if melody_wavs is not None:
-            if isinstance(melody_wavs, torch.Tensor):
-                if melody_wavs.dim() == 2:
-                    # Single [C, T] melody: add the batch axis.
-                    melody_wavs = melody_wavs[None]
-                if melody_wavs.dim() != 3:
-                    raise ValueError("Melody wavs should have a shape [B, C, T].")
-                melody_wavs = list(melody_wavs)
-            else:
-                for melody in melody_wavs:
-                    if melody is not None:
-                        assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
-            # Resample / remix every provided melody to the model's own format; None entries stay None.
-            melody_wavs = [
-                convert_audio(wav, sample_rate, self.sample_rate, self.audio_channels)
-                if wav is not None else None
-                for wav in melody_wavs]
-        if prompt is not None:
-            if prompt.dim() == 2:
-                prompt = prompt[None]
-            if prompt.dim() != 3:
-                raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
-            prompt = convert_audio(prompt, sample_rate, self.sample_rate, self.audio_channels)
-        if descriptions is None:
-            # Size the placeholder batch from whichever conditioning input is available;
-            # the previous code called len(prompt) even when prompt was None.
-            if prompt is not None:
-                batch_size = len(prompt)
-            elif melody_wavs is not None:
-                batch_size = len(melody_wavs)
-            else:
-                raise ValueError("descriptions, melody_wavs and prompt cannot all be None.")
-            descriptions = [None] * batch_size
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
-                                                                        melody_wavs=melody_wavs)
-        # Prompt tokens must exist exactly when a prompt was supplied.
-        assert (prompt is None) == (prompt_tokens is None)
-        # Forward the per-call callback; it used to be accepted and then silently dropped.
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress,
-                                       progress_callback=progress_callback)
-        audio = self.generate_audio(tokens)
-        return (audio, tokens) if return_tokens else audio
-    def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
-                              descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
-                              progress: bool = False, return_tokens: bool = False,
-                              progress_callback: tp.Optional[gr.Progress] = None) \
-            -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, torch.Tensor]]:
-        """Generate samples conditioned on audio prompts.
-        Note: this is a hack and does not follow the progression of conditioning melody or
-        previously generated audio.
-        Args:
-            prompt (torch.Tensor): A batch of waveforms used for continuation.
-                Prompt should be [B, C, T], or [C, T] if only one sample is generated.
-            prompt_sample_rate (int): Sampling rate of the given audio waveforms.
-            descriptions (list of str, optional): A list of strings used as text conditioning. Defaults to None.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            return_tokens (bool, optional): If True, also return the generated tokens. Defaults to False.
-            progress_callback (gr.Progress, optional): Per-call progress hook, forwarded to
-                `_generate_tokens`. Defaults to None.
-        Returns:
-            The output of `generate_audio`, or the tuple `(audio, tokens)` when `return_tokens` is True.
-        Raises:
-            ValueError: If `prompt` is neither 2-D nor 3-D.
-        """
-        if prompt.dim() == 2:
-            # Single [C, T] prompt: add the batch axis.
-            prompt = prompt[None]
-        if prompt.dim() != 3:
-            raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
-        prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
-        if descriptions is None:
-            descriptions = [None] * len(prompt)
-        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
-        assert prompt_tokens is not None
-        # Forward the per-call callback; it used to be accepted and then silently dropped.
-        tokens = self._generate_tokens(attributes, prompt_tokens, progress,
-                                       progress_callback=progress_callback)
-        audio = self.generate_audio(tokens)
-        return (audio, tokens) if return_tokens else audio
-    @torch.no_grad()
-    def _prepare_tokens_and_attributes(
-            self,
-            descriptions: tp.Sequence[tp.Optional[str]],
-            prompt: tp.Optional[torch.Tensor],
-            melody_wavs: tp.Optional[MelodyList] = None,
-            progress_callback: tp.Optional[gr.Progress] = None
-    ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
-        """Build the conditioning attributes and encode the optional audio prompt.
-        Args:
-            descriptions (list of str): One text condition per sample; entries may be None.
-            prompt (torch.Tensor): Optional batch of waveforms to continue from.
-            melody_wavs (list of Tensor, optional): One melody waveform (or None) per
-                description. Defaults to None, meaning no melody conditioning.
-            progress_callback (gr.Progress, optional): Accepted but not used by this method.
-        Returns:
-            A tuple `(attributes, prompt_tokens)`; `prompt_tokens` is None when no prompt is given.
-        """
-        def _null_wav_condition() -> WavCondition:
-            # Placeholder condition of length 0, used when a sample has no melody.
-            return WavCondition(
-                torch.zeros((1, 1, 1), device=self.device),
-                torch.tensor([0], device=self.device),
-                sample_rate=[self.sample_rate],
-                path=[None])  # type: ignore
-        attributes = [ConditioningAttributes(text={'description': text}) for text in descriptions]
-        if melody_wavs is not None:
-            if 'self_wav' not in self.lm.condition_provider.conditioners:
-                raise RuntimeError("This model doesn't support melody conditioning. "
-                                   "Use the `melody` model.")
-            assert len(melody_wavs) == len(descriptions), \
-                f"number of melody wavs must match number of descriptions! " \
-                f"got melody len={len(melody_wavs)}, and descriptions len={len(descriptions)}"
-            melodies = melody_wavs
-        else:
-            # No melody at all: every sample receives the placeholder condition.
-            melodies = [None] * len(attributes)
-        for attr, melody in zip(attributes, melodies):
-            if melody is None:
-                attr.wav['self_wav'] = _null_wav_condition()
-                continue
-            attr.wav['self_wav'] = WavCondition(
-                melody[None].to(device=self.device),
-                torch.tensor([melody.shape[-1]], device=self.device),
-                sample_rate=[self.sample_rate],
-                path=[None],
-            )
-        if prompt is None:
-            return attributes, None
-        if descriptions is not None:
-            assert len(descriptions) == len(prompt), "Prompt and nb. descriptions doesn't match"
-        prompt_tokens, scale = self.compression_model.encode(prompt.to(self.device))
-        assert scale is None
-        return attributes, prompt_tokens
-    def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
-                         prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False, progress_callback: gr.Progress = None) -> torch.Tensor:
-        """Generate discrete audio tokens given audio prompt and/or conditions.
-        When `self.duration` fits within `self.max_duration` a single call to `self.lm.generate`
-        is made. Otherwise the sequence is produced window by window: each window is prompted
-        with the tail of the previous one and the window start advances by `self.extend_stride`
-        seconds.
-        Args:
-            attributes (list of ConditioningAttributes): Conditions used for generation (text/melody).
-                In the windowed case their `wav['self_wav']` entries are overwritten in place.
-            prompt_tokens (torch.Tensor, optional): Audio prompt used for continuation.
-            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-            progress_callback (gr.Progress, optional): Called as `progress_callback(fraction, message)`
-                on every progress update, in addition to `self._progress_callback`.
-        Returns:
-            torch.Tensor: Generated tokens (not decoded audio), a 3-D tensor whose last axis is
-                time in frames (presumably [B, K, T] - TODO confirm).
-        """
-        total_gen_len = int(self.duration * self.frame_rate)
-        # A prompt can never be longer than what one generation window can hold.
-        max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
-        # Frame offset of the current window; read by the closure below and advanced in the loop.
-        current_gen_offset: int = 0
-        def _progress_callback(generated_tokens: int, tokens_to_generate: int):
-            # Convert frame counts to seconds by scaling so that `tokens_to_generate` maps to
-            # `self.duration`.
-            # NOTE(review): divides by zero if tokens_to_generate == 0; in the windowed case
-            # tokens_to_generate presumably covers one window only, so the reported fraction
-            # may be off - confirm.
-            generated_tokens += current_gen_offset
-            generated_tokens /= ((tokens_to_generate) / self.duration)
-            tokens_to_generate /= ((tokens_to_generate) / self.duration)
-            if self._progress_callback is not None:
-                # Note that total_gen_len might be quite wrong depending on the
-                # codebook pattern used, but with delay it is almost accurate.
-                self._progress_callback((generated_tokens / tokens_to_generate), f"Generated {generated_tokens: 6.2f}/{tokens_to_generate: 6.2f} seconds")
-            if progress_callback is not None:
-                # Update Gradio progress bar
-                progress_callback((generated_tokens / tokens_to_generate), f"Generated {generated_tokens: 6.2f}/{tokens_to_generate: 6.2f} seconds")
-            if progress:
-                print(f'{generated_tokens: 6.2f} / {tokens_to_generate: 6.2f}', end='\r')
-        if prompt_tokens is not None:
-            if prompt_tokens.shape[-1] > max_prompt_len:
-                # Keep only the first max_prompt_len frames of an over-long prompt.
-                prompt_tokens = prompt_tokens[..., :max_prompt_len]
-        # The callback is always installed; it decides by itself which sinks to notify.
-        callback = _progress_callback
-        if self.duration <= self.max_duration:
-            # generate by sampling from LM, simple case.
-            with self.autocast:
-                gen_tokens = self.lm.generate(
-                    prompt_tokens, attributes,
-                    callback=callback, max_gen_len=total_gen_len, **self.generation_params)
-        else:
-            # now this gets a bit messier, we need to handle prompts,
-            # melody conditioning etc.
-            # Keep the original melody conditions: attributes are overwritten for every window.
-            ref_wavs = [attr.wav['self_wav'] for attr in attributes]
-            all_tokens = []
-            if prompt_tokens is None:
-                prompt_length = 0
-            else:
-                all_tokens.append(prompt_tokens)
-                prompt_length = prompt_tokens.shape[-1]
-            stride_tokens = int(self.frame_rate * self.extend_stride)
-            while current_gen_offset + prompt_length < total_gen_len:
-                time_offset = current_gen_offset / self.frame_rate
-                # The last window may be shorter than max_duration.
-                chunk_duration = min(self.duration - time_offset, self.max_duration)
-                max_gen_len = int(chunk_duration * self.frame_rate)
-                for attr, ref_wav in zip(attributes, ref_wavs):
-                    wav_length = ref_wav.length.item()
-                    if wav_length == 0:
-                        # Length-0 placeholder condition: nothing to window.
-                        continue
-                    # We will extend the wav periodically if it not long enough.
-                    # we have to do it here rather than in conditioners.py as otherwise
-                    # we wouldn't have the full wav.
-                    initial_position = int(time_offset * self.sample_rate)
-                    wav_target_length = int(self.max_duration * self.sample_rate)
-                    # NOTE(review): looks like a leftover debug print (window start / length in seconds).
-                    print(initial_position / self.sample_rate, wav_target_length / self.sample_rate)
-                    positions = torch.arange(initial_position,
-                                             initial_position + wav_target_length, device=self.device)
-                    # `positions % wav_length` wraps around, tiling the reference wav periodically.
-                    attr.wav['self_wav'] = WavCondition(
-                        ref_wav[0][..., positions % wav_length],
-                        torch.full_like(ref_wav[1], wav_target_length),
-                        [self.sample_rate] * ref_wav[0].size(0),
-                        [None], [0.])
-                with self.autocast:
-                    gen_tokens = self.lm.generate(
-                        prompt_tokens, attributes,
-                        callback=callback, max_gen_len=max_gen_len, **self.generation_params)
-                if prompt_tokens is None:
-                    all_tokens.append(gen_tokens)
-                else:
-                    # Drop the prompt part: those frames are already stored in all_tokens.
-                    all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
-                # Everything past the stride becomes the prompt of the next window.
-                prompt_tokens = gen_tokens[:, :, stride_tokens:]
-                prompt_length = prompt_tokens.shape[-1]
-                current_gen_offset += stride_tokens
-            gen_tokens = torch.cat(all_tokens, dim=-1)
-        return gen_tokens
-        # generate audio
-    def generate_audio(self, gen_tokens: torch.Tensor) -> tp.Optional[torch.Tensor]:
-        """Decode generated tokens into audio with the compression model.
-        Args:
-            gen_tokens (torch.Tensor): Token tensor; it must have exactly 3 dimensions.
-        Returns:
-            Whatever `self.compression_model.decode(gen_tokens, None)` returns, or None when
-            validation or decoding fails (the error is printed, not raised).
-        """
-        try:
-            if gen_tokens.dim() != 3:
-                # Explicit check instead of `assert`, which is stripped under `python -O`.
-                raise ValueError(f"gen_tokens must have 3 dimensions, got {gen_tokens.dim()}")
-            with torch.no_grad():
-                return self.compression_model.decode(gen_tokens, None)
-        except Exception as e:
-            # Deliberate best-effort: callers get None instead of an exception.
-            print(f"Error generating audio: {e}")
-            return None
-    #def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
-    #                     prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
-    #    """Generate discrete audio tokens given audio prompt and/or conditions.
-    #    Args:
-    #        attributes (tp.List[ConditioningAttributes]): Conditions used for generation (text/melody).
-    #        prompt_tokens (tp.Optional[torch.Tensor]): Audio prompt used for continuation.
-    #        progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
-    #    Returns:
-    #        torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
-    #    """
-    #    def _progress_callback(generated_tokens: int, tokens_to_generate: int):
-    #        print(f'{generated_tokens: 6d} / {tokens_to_generate: 6d}', end='\r')
-    #    if prompt_tokens is not None:
-    #        assert self.generation_params['max_gen_len'] > prompt_tokens.shape[-1], \
-    #            "Prompt is longer than audio to generate"
-    #    callback = None
-    #    if progress:
-    #        callback = _progress_callback
-    #    # generate by sampling from LM
-    #    with self.autocast:
-    #        gen_tokens = self.lm.generate(prompt_tokens, attributes, callback=callback, **self.generation_params)
-    #    # generate audio
-    #    assert gen_tokens.dim() == 3
-    #    with torch.no_grad():
-    #        gen_audio = self.compression_model.decode(gen_tokens, None)
-    #    return gen_audio
-    def to(self, device: str):
-        """Move the compression model and the language model to `device`; return self for chaining."""
-        for module in (self.compression_model, self.lm):
-            module.to(device)
-        return self

audiocraft/models/unet.py DELETED Viewed

@@ -1,214 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Pytorch Unet Module used for diffusion.
-"""
-from dataclasses import dataclass
-import typing as tp
-import torch
-from torch import nn
-from torch.nn import functional as F
-from audiocraft.modules.transformer import StreamingTransformer, create_sin_embedding
-@dataclass
-class Output:
-    """Container returned by `DiffusionUnet.forward`, wrapping the output tensor."""
-    sample: torch.Tensor
-def get_model(cfg, channels: int, side: int, num_steps: int):
-    """Build the diffusion model selected by `cfg.model`.
-    Args:
-        cfg: Configuration exposing `model` and, for the unet, the `diffusion_unet` keyword mapping.
-        channels (int): Passed to the model as `chin`.
-        side (int): Accepted but not used by this function.
-        num_steps (int): Passed to the model as `num_steps`.
-    Raises:
-        RuntimeError: If `cfg.model` is anything other than `'unet'`.
-    """
-    if cfg.model != 'unet':
-        raise RuntimeError('Not Implemented')
-    return DiffusionUnet(chin=channels, num_steps=num_steps, **cfg.diffusion_unet)
-class ResBlock(nn.Module):
-    """Residual block: two (GroupNorm -> activation -> Conv1d -> Dropout1d) stages plus a skip."""
-    def __init__(self, channels: int, kernel: int = 3, norm_groups: int = 4,
-                 dilation: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
-                 dropout: float = 0.):
-        super().__init__()
-        # Both convolutions use stride 1; with an odd kernel this padding keeps the length unchanged.
-        padding = dilation * (kernel - 1) // 2
-        self.norm1 = nn.GroupNorm(norm_groups, channels)
-        self.conv1 = nn.Conv1d(channels, channels, kernel, 1, padding, dilation=dilation)
-        self.activation1 = activation()
-        self.dropout1 = nn.Dropout1d(dropout)
-        self.norm2 = nn.GroupNorm(norm_groups, channels)
-        self.conv2 = nn.Conv1d(channels, channels, kernel, 1, padding, dilation=dilation)
-        self.activation2 = activation()
-        self.dropout2 = nn.Dropout1d(dropout)
-    def forward(self, x):
-        stages = (
-            (self.norm1, self.activation1, self.conv1, self.dropout1),
-            (self.norm2, self.activation2, self.conv2, self.dropout2),
-        )
-        h = x
-        for norm, act, conv, drop in stages:
-            h = drop(conv(act(norm(h))))
-        return x + h
-class DecoderLayer(nn.Module):
-    """Decoder stage: residual blocks, GroupNorm, activation, then a transposed convolution."""
-    def __init__(self, chin: int, chout: int, kernel: int = 4, stride: int = 2,
-                 norm_groups: int = 4, res_blocks: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
-                 dropout: float = 0.):
-        super().__init__()
-        # Residual blocks run at the input width, with dilation doubling per block (1, 2, 4, ...).
-        blocks = [
-            ResBlock(chin, norm_groups=norm_groups, dilation=2**idx, dropout=dropout)
-            for idx in range(res_blocks)
-        ]
-        self.res_blocks = nn.Sequential(*blocks)
-        self.norm = nn.GroupNorm(norm_groups, chin)
-        self.convtr = nn.ConvTranspose1d(chin, chout, kernel, stride, (kernel - stride) // 2, bias=False)
-        self.activation = activation()
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        hidden = self.activation(self.norm(self.res_blocks(x)))
-        return self.convtr(hidden)
-class EncoderLayer(nn.Module):
-    """Encoder stage: strided convolution, GroupNorm, activation, then residual blocks."""
-    def __init__(self, chin: int, chout: int, kernel: int = 4, stride: int = 2,
-                 norm_groups: int = 4, res_blocks: int = 1, activation: tp.Type[nn.Module] = nn.ReLU,
-                 dropout: float = 0.):
-        super().__init__()
-        self.conv = nn.Conv1d(chin, chout, kernel, stride, (kernel - stride) // 2, bias=False)
-        self.norm = nn.GroupNorm(norm_groups, chout)
-        self.activation = activation()
-        # Residual blocks run at the output width, with dilation doubling per block (1, 2, 4, ...).
-        blocks = [
-            ResBlock(chout, norm_groups=norm_groups, dilation=2**idx, dropout=dropout)
-            for idx in range(res_blocks)
-        ]
-        self.res_blocks = nn.Sequential(*blocks)
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        _, _, length = x.shape
-        stride = self.conv.stride[0]
-        # Right-pad with zeros so the length becomes a multiple of the stride.
-        x = F.pad(x, (0, (-length) % stride))
-        hidden = self.activation(self.norm(self.conv(x)))
-        return self.res_blocks(hidden)
-class BLSTM(nn.Module):
-    """Bidirectional LSTM whose hidden size equals the input dimension.
-    A linear layer projects the concatenated forward/backward states (2 * dim) back to dim.
-    """
-    def __init__(self, dim, layers=2):
-        super().__init__()
-        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
-        self.linear = nn.Linear(2 * dim, dim)
-    def forward(self, x):
-        # Move the last axis to the front: the LSTM is sequence-first (batch_first is left at False).
-        sequence = x.permute(2, 0, 1)
-        states, _ = self.lstm(sequence)
-        # Project back to dim and restore the original axis order.
-        return self.linear(states).permute(1, 2, 0)
-class DiffusionUnet(nn.Module):
-    """1-D U-Net used for diffusion, with a step embedding and optional codec conditioning.
-    Args:
-        chin (int): Number of input (and output) channels.
-        hidden (int): Channels of the first encoder layer.
-        depth (int): Number of encoder/decoder layer pairs.
-        growth (float): Channel multiplier applied from one layer to the next.
-        max_channels (int): Upper bound on the channels of any layer.
-        num_steps (int): Size of the step embedding table(s).
-        emb_all_layers (bool): If True, add a step embedding after every encoder layer
-            instead of only after the first one.
-        cross_attention (bool): With `transformer=True`, feed the condition through
-            cross-attention instead of adding it to the bottleneck.
-        bilstm (bool): Use a BLSTM in the bottleneck when no transformer is used.
-        transformer (bool): Use a StreamingTransformer in the bottleneck.
-        codec_dim (int, optional): Channels of the conditioning signal; None disables conditioning.
-        **kwargs: Forwarded to every EncoderLayer and DecoderLayer.
-    """
-    def __init__(self, chin: int = 3, hidden: int = 24, depth: int = 3, growth: float = 2.,
-                 max_channels: int = 10_000, num_steps: int = 1000, emb_all_layers=False, cross_attention: bool = False,
-                 bilstm: bool = False, transformer: bool = False,
-                 codec_dim: tp.Optional[int] = None, **kwargs):
-        super().__init__()
-        self.encoders = nn.ModuleList()
-        self.decoders = nn.ModuleList()
-        self.embeddings: tp.Optional[nn.ModuleList] = None
-        self.embedding = nn.Embedding(num_steps, hidden)
-        if emb_all_layers:
-            self.embeddings = nn.ModuleList()
-        self.condition_embedding: tp.Optional[nn.Module] = None
-        for d in range(depth):
-            encoder = EncoderLayer(chin, hidden, **kwargs)
-            decoder = DecoderLayer(hidden, chin, **kwargs)
-            self.encoders.append(encoder)
-            # Decoders are stored in reverse so they mirror the encoders.
-            self.decoders.insert(0, decoder)
-            if emb_all_layers and d > 0:
-                assert self.embeddings is not None
-                self.embeddings.append(nn.Embedding(num_steps, hidden))
-            chin = hidden
-            hidden = min(int(chin * growth), max_channels)
-        # From here on `chin` is the bottleneck width.
-        self.bilstm: tp.Optional[nn.Module]
-        if bilstm:
-            self.bilstm = BLSTM(chin)
-        else:
-            self.bilstm = None
-        self.use_transformer = transformer
-        self.cross_attention = False
-        if transformer:
-            self.cross_attention = cross_attention
-            self.transformer = StreamingTransformer(chin, 8, 6, bias_ff=False, bias_attn=False,
-                                                    cross_attention=cross_attention)
-        self.use_codec = False
-        if codec_dim is not None:
-            self.conv_codec = nn.Conv1d(codec_dim, chin, 1)
-            self.use_codec = True
-    def forward(self, x: torch.Tensor, step: tp.Union[int, torch.Tensor], condition: tp.Optional[torch.Tensor] = None):
-        """Run the U-Net on `x` for diffusion step `step`.
-        Args:
-            x (torch.Tensor): Input; axis 0 is the batch and axis 2 is the length.
-            step (int or torch.Tensor): Diffusion step; an int is broadcast to the whole batch.
-            condition (torch.Tensor, optional): Conditioning signal, required when the model
-                was built with `codec_dim`.
-        Returns:
-            Output: wraps a tensor cropped to the length of `x`.
-        """
-        skips = []
-        bs = x.size(0)
-        z = x
-        view_args = [1]
-        # isinstance (rather than an exact type check) also accepts Tensor subclasses.
-        if isinstance(step, torch.Tensor):
-            step_tensor = step
-        else:
-            step_tensor = torch.tensor([step], device=x.device, dtype=torch.long).expand(bs)
-        for idx, encoder in enumerate(self.encoders):
-            z = encoder(z)
-            if idx == 0:
-                z = z + self.embedding(step_tensor).view(bs, -1, *view_args).expand_as(z)
-            elif self.embeddings is not None:
-                z = z + self.embeddings[idx - 1](step_tensor).view(bs, -1, *view_args).expand_as(z)
-            skips.append(z)
-        # Defined up front: a transformer without codec conditioning used to hit an
-        # UnboundLocalError because this name was only bound inside the codec branch.
-        cross_attention_src = None
-        if self.use_codec:  # insert condition in the bottleneck
-            assert condition is not None, "Model defined for conditionnal generation"
-            condition_emb = self.conv_codec(condition)  # reshape to the bottleneck dim
-            assert condition_emb.size(-1) <= 2 * z.size(-1), \
-                f"You are downsampling the conditionning with factor >=2 : {condition_emb.size(-1)=} and {z.size(-1)=}"
-            if not self.cross_attention:
-                # Additive conditioning: resample the condition to the bottleneck length.
-                condition_emb = torch.nn.functional.interpolate(condition_emb, z.size(-1))
-                assert z.size() == condition_emb.size()
-                z += condition_emb
-            else:
-                cross_attention_src = condition_emb.permute(0, 2, 1)  # B, T, C
-                B, T, C = cross_attention_src.shape
-                positions = torch.arange(T, device=x.device).view(1, -1, 1)
-                pos_emb = create_sin_embedding(positions, C, max_period=10_000, dtype=cross_attention_src.dtype)
-                cross_attention_src = cross_attention_src + pos_emb
-        if self.use_transformer:
-            z = self.transformer(z.permute(0, 2, 1), cross_attention_src=cross_attention_src).permute(0, 2, 1)
-        else:
-            if self.bilstm is None:
-                # No bottleneck module: the decoders then only see the skip connections.
-                z = torch.zeros_like(z)
-            else:
-                z = self.bilstm(z)
-        for decoder in self.decoders:
-            s = skips.pop(-1)
-            # Crop to the skip length before adding it.
-            z = z[:, :, :s.shape[2]]
-            z = z + s
-            z = decoder(z)
-        z = z[:, :, :x.shape[2]]
-        return Output(z)

audiocraft/modules/__init__.py DELETED Viewed

@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# flake8: noqa
-from .conv import (
-    NormConv1d,
-    NormConv2d,
-    NormConvTranspose1d,
-    NormConvTranspose2d,
-    StreamableConv1d,
-    StreamableConvTranspose1d,
-    pad_for_conv1d,
-    pad1d,
-    unpad1d,
-)
-from .lstm import StreamableLSTM
-from .seanet import SEANetEncoder, SEANetDecoder
-from .transformer import StreamingTransformer

audiocraft/modules/activations.py DELETED Viewed

@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import torch
-import torch.nn as nn
-from torch import Tensor
-from typing import Union, Callable
-class CustomGLU(nn.Module):
-    r"""Custom Gated Linear Unit activation.
-    Applies a modified gated linear unit :math:`a * f(b)` where :math:`a` is the first half
-    of the input matrices, :math:`b` is the second half, and :math:`f` is a provided activation
-    function (i.e. sigmoid, swish, etc.).
-    Args:
-        activation (nn.Module): The custom activation to apply in the Gated Linear Unit
-        dim (int): the dimension on which to split the input. Default: -1
-    Shape:
-        - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
-          dimensions
-        - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`
-    Examples::
-        >>> m = CustomGLU(nn.Sigmoid())
-        >>> input = torch.randn(4, 2)
-        >>> output = m(input)
-    """
-    # The docstring is raw: in a normal string literal `\a` is the BEL escape, which
-    # garbled every `\ast` in the Shape section.
-    def __init__(self, activation: nn.Module, dim: int = -1):
-        super().__init__()
-        self.dim = dim
-        self.activation = activation
-    def forward(self, x: Tensor):
-        """Split `x` in two halves along `self.dim` and return `a * activation(b)`."""
-        assert x.shape[self.dim] % 2 == 0, "size along `dim` must be even"  # M = N / 2
-        a, b = torch.chunk(x, 2, dim=self.dim)
-        return a * self.activation(b)
-class SwiGLU(CustomGLU):
-    """Gated Linear Unit using SiLU as the gate activation.
-    Computes :math:`a * SiLU(b)`, with :math:`a` and :math:`b` the first and second halves
-    of the input.
-    Args:
-        dim (int): the dimension on which to split the input. Default: -1
-    """
-    def __init__(self, dim: int = -1):
-        super().__init__(nn.SiLU(), dim)
-class GeGLU(CustomGLU):
-    """Gated Linear Unit using GELU as the gate activation.
-    Computes :math:`a * GELU(b)`, with :math:`a` and :math:`b` the first and second halves
-    of the input.
-    Args:
-        dim (int): the dimension on which to split the input. Default: -1
-    """
-    def __init__(self, dim: int = -1):
-        super().__init__(nn.GELU(), dim)
-class ReGLU(CustomGLU):
-    """Gated Linear Unit using ReLU as the gate activation.
-    Computes :math:`a * ReLU(b)`, with :math:`a` and :math:`b` the first and second halves
-    of the input.
-    Args:
-        dim (int): the dimension on which to split the input. Default: -1
-    """
-    def __init__(self, dim: int = -1):
-        super().__init__(nn.ReLU(), dim)
-def get_activation_fn(
-    activation: Union[str, Callable[[Tensor], Tensor]]
-) -> Union[str, Callable[[Tensor], Tensor]]:
-    """Resolve a gated-activation name to a module instance.
-    The names "reglu", "geglu" and "swiglu" yield a new ReGLU, GeGLU or SwiGLU instance.
-    Anything else (an unrecognized string or a callable) is handed back unchanged.
-    Args:
-        activation (Union[str, Callable[[Tensor], Tensor]]): Activation to check
-    """
-    if not isinstance(activation, str):
-        return activation
-    if activation == "reglu":
-        return ReGLU()
-    if activation == "geglu":
-        return GeGLU()
-    if activation == "swiglu":
-        return SwiGLU()
-    # Unknown names are passed through for the caller to interpret.
-    return activation

audiocraft/modules/chroma.py DELETED Viewed

@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-from einops import rearrange
-from librosa import filters
-import torch
-from torch import nn
-import torch.nn.functional as F
-import torchaudio
-class ChromaExtractor(nn.Module):
-    """Chroma extraction and quantization.
-    Args:
-        sample_rate (int): Sample rate for the chroma extraction.
-        n_chroma (int): Number of chroma bins for the chroma extraction.
-        radix2_exp (int): Size of stft window for the chroma extraction (power of 2, e.g. 12 -> 2^12).
-        nfft (int, optional): Number of FFT.
-        winlen (int, optional): Window length.
-        winhop (int, optional): Window hop size.
-        argmax (bool, optional): Whether to use argmax. Defaults to False.
-        norm (float, optional): Norm for chroma normalization. Defaults to inf.
-    """
-    def __init__(self, sample_rate: int, n_chroma: int = 12, radix2_exp: int = 12, nfft: tp.Optional[int] = None,
-                 winlen: tp.Optional[int] = None, winhop: tp.Optional[int] = None, argmax: bool = False,
-                 norm: float = torch.inf):
-        super().__init__()
-        self.winlen = winlen or 2 ** radix2_exp
-        self.nfft = nfft or self.winlen
-        self.winhop = winhop or (self.winlen // 4)
-        self.sample_rate = sample_rate
-        self.n_chroma = n_chroma
-        self.norm = norm
-        self.argmax = argmax
-        self.register_buffer('fbanks', torch.from_numpy(filters.chroma(sr=sample_rate, n_fft=self.nfft, tuning=0,
-                                                                       n_chroma=self.n_chroma)), persistent=False)
-        self.spec = torchaudio.transforms.Spectrogram(n_fft=self.nfft, win_length=self.winlen,
-                                                      hop_length=self.winhop, power=2, center=True,
-                                                      pad=0, normalized=True)
-    def forward(self, wav: torch.Tensor) -> torch.Tensor:
-        T = wav.shape[-1]
-        # in case we are getting a wav that was dropped out (nullified)
-        # from the conditioner, make sure wav length is no less that nfft
-        if T < self.nfft:
-            pad = self.nfft - T
-            r = 0 if pad % 2 == 0 else 1
-            wav = F.pad(wav, (pad // 2, pad // 2 + r), 'constant', 0)
-            assert wav.shape[-1] == self.nfft, f"expected len {self.nfft} but got {wav.shape[-1]}"
-        spec = self.spec(wav).squeeze(1)
-        raw_chroma = torch.einsum('cf,...ft->...ct', self.fbanks, spec)
-        norm_chroma = torch.nn.functional.normalize(raw_chroma, p=self.norm, dim=-2, eps=1e-6)
-        norm_chroma = rearrange(norm_chroma, 'b d t -> b t d')
-        if self.argmax:
-            idx = norm_chroma.argmax(-1, keepdim=True)
-            norm_chroma[:] = 0
-            norm_chroma.scatter_(dim=-1, index=idx, value=1)
-        return norm_chroma

audiocraft/modules/codebooks_patterns.py DELETED Viewed

@@ -1,548 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from collections import namedtuple
-from dataclasses import dataclass
-from functools import lru_cache
-import logging
-import typing as tp
-from abc import ABC, abstractmethod
-import torch
-LayoutCoord = namedtuple('LayoutCoord', ['t', 'q'])  # (timestep, codebook index)
-PatternLayout = tp.List[tp.List[LayoutCoord]]  # Sequence of coordinates
-logger = logging.getLogger(__name__)
-@dataclass
-class Pattern:
-    """Base implementation of a pattern over a sequence with multiple codebooks.
-    The codebook pattern consists in a layout, defining for each sequence step
-    the list of coordinates of each codebook timestep in the resulting interleaved sequence.
-    The first item of the pattern is always an empty list in order to properly insert a special token
-    to start with. For convenience, we also keep track of ``n_q`` the number of codebooks used for the pattern
-    and ``timesteps`` the number of timesteps corresponding to the original sequence.
-    The pattern provides convenient methods to build and revert interleaved sequences from it:
-    ``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T]
-        to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size,
-        K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
-        for the output sequence. The unfilled positions are replaced with a special token and the built sequence
-        is returned along with a mask indicating valid tokens.
-    ``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment
-        of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask
-        to fill and specify invalid positions if needed.
-    See the dedicated methods for more details.
-    """
-    # Pattern layout, for each sequence step, we have a list of coordinates
-    # corresponding to the original codebook timestep and position.
-    # The first list is always an empty list in order to properly insert
-    # a special token to start with.
-    layout: PatternLayout
-    timesteps: int
-    n_q: int
-    def __post_init__(self):
-        assert len(self.layout) > 0
-        self._validate_layout()
-        self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
-        self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
-        logger.info("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
-    def _validate_layout(self):
-        """Runs checks on the layout to ensure a valid pattern is defined.
-        A pattern is considered invalid if:
-            - Multiple timesteps for a same codebook are defined in the same sequence step
-            - The timesteps for a given codebook are not in ascending order as we advance in the sequence
-              (this would mean that we have future timesteps before past timesteps).
-        """
-        q_timesteps = {q: 0 for q in range(self.n_q)}
-        for s, seq_coords in enumerate(self.layout):
-            if len(seq_coords) > 0:
-                qs = set()
-                for coord in seq_coords:
-                    qs.add(coord.q)
-                    last_q_timestep = q_timesteps[coord.q]
-                    assert coord.t >= last_q_timestep, \
-                        f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}"
-                    q_timesteps[coord.q] = coord.t
-                # each sequence step contains at max 1 coordinate per codebook
-                assert len(qs) == len(seq_coords), \
-                    f"Multiple entries for a same codebook are found at step {s}"
-    @property
-    def num_sequence_steps(self):
-        return len(self.layout) - 1
-    @property
-    def max_delay(self):
-        max_t_in_seq_coords = 0
-        for seq_coords in self.layout[1:]:
-            for coords in seq_coords:
-                max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
-        return max_t_in_seq_coords - self.timesteps
-    @property
-    def valid_layout(self):
-        valid_step = len(self.layout) - self.max_delay
-        return self.layout[:valid_step]
-    def starts_with_special_token(self):
-        return self.layout[0] == []
-    def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
-        """Get codebook coordinates in the layout that corresponds to the specified timestep t
-        and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
-        and the actual codebook coordinates.
-        """
-        assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps"
-        if q is not None:
-            assert q <= self.n_q, "provided number of codebooks is greater than the pattern's number of codebooks"
-        coords = []
-        for s, seq_codes in enumerate(self.layout):
-            for code in seq_codes:
-                if code.t == t and (q is None or code.q == q):
-                    coords.append((s, code))
-        return coords
-    def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]:
-        return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]
-    def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]:
-        steps_with_timesteps = self.get_steps_with_timestep(t, q)
-        return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None
-    def _build_pattern_sequence_scatter_indexes(self, timesteps: int, n_q: int, keep_only_valid_steps: bool,
-                                                device: tp.Union[torch.device, str] = 'cpu'):
-        """Build scatter indexes corresponding to the pattern, up to the provided sequence_steps.
-        Args:
-            timesteps (int): Maximum number of timesteps steps to consider.
-            keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps.
-            device (torch.device or str): Device for created tensors.
-        Returns:
-            indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S].
-            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S].
-        """
-        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
-        assert timesteps <= self.timesteps, "invalid number of timesteps used to build the sequence from the pattern"
-        # use the proper layout based on whether we limit ourselves to valid steps only or not,
-        # note that using the valid_layout will result in a truncated sequence up to the valid steps
-        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
-        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
-        indexes = torch.zeros(n_q, len(ref_layout), dtype=torch.long).numpy()
-        mask = torch.zeros(n_q, len(ref_layout), dtype=torch.bool).numpy()
-        # fill indexes with last sequence step value that will correspond to our special token
-        # the last value is n_q * timesteps as we have flattened z and append special token as the last token
-        # which will correspond to the index: n_q * timesteps
-        indexes[:] = n_q * timesteps
-        # iterate over the pattern and fill scattered indexes and mask
-        for s, sequence_coords in enumerate(ref_layout):
-            for coords in sequence_coords:
-                if coords.t < timesteps:
-                    indexes[coords.q, s] = coords.t + coords.q * timesteps
-                    mask[coords.q, s] = 1
-        indexes = torch.from_numpy(indexes).to(device)
-        mask = torch.from_numpy(mask).to(device)
-        return indexes, mask
-    def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
-        """Build sequence corresponding to the pattern from the input tensor z.
-        The sequence is built using up to sequence_steps if specified, and non-pattern
-        coordinates are filled with the special token.
-        Args:
-            z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T].
-            special_token (int): Special token used to fill non-pattern coordinates in the new sequence.
-            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
-                Steps that are beyond valid steps will be replaced by the special_token in that case.
-        Returns:
-            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S
-                corresponding either to the sequence_steps if provided, otherwise to the length of the pattern.
-            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S].
-            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].
-        """
-        B, K, T = z.shape
-        indexes, mask = self._build_pattern_sequence_scatter_indexes(
-            T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
-        )
-        z = z.view(B, -1)
-        # we append the special token as the last index of our flattened z tensor
-        z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
-        values = z[:, indexes.view(-1)]
-        values = values.view(B, K, indexes.shape[-1])
-        return values, indexes, mask
-    def _build_reverted_sequence_scatter_indexes(self, sequence_steps: int, n_q: int,
-                                                 keep_only_valid_steps: bool = False,
-                                                 is_model_output: bool = False,
-                                                 device: tp.Union[torch.device, str] = 'cpu'):
-        """Builds scatter indexes required to retrieve the original multi-codebook sequence
-        from interleaving pattern.
-        Args:
-            sequence_steps (int): Sequence steps.
-            n_q (int): Number of codebooks.
-            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
-                Steps that are beyond valid steps will be replaced by the special_token in that case.
-            is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not.
-            device (torch.device or str): Device for created tensors.
-        Returns:
-            indexes (torch.Tensor): Indexes for reconstructing the output, of shape [K, T].
-            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
-        """
-        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
-        # TODO(jade): Do we want to further truncate to only valid timesteps here as well?
-        timesteps = self.timesteps
-        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
-        assert sequence_steps <= len(ref_layout), \
-            f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"
-        # ensure we take the appropriate indexes to keep the model output from the first special token as well
-        if is_model_output and self.starts_with_special_token():
-            ref_layout = ref_layout[1:]
-        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
-        indexes = torch.zeros(n_q, timesteps, dtype=torch.long).numpy()
-        mask = torch.zeros(n_q, timesteps, dtype=torch.bool).numpy()
-        # fill indexes with last sequence step value that will correspond to our special token
-        indexes[:] = n_q * sequence_steps
-        for s, sequence_codes in enumerate(ref_layout):
-            if s < sequence_steps:
-                for code in sequence_codes:
-                    if code.t < timesteps:
-                        indexes[code.q, code.t] = s + code.q * sequence_steps
-                        mask[code.q, code.t] = 1
-        indexes = torch.from_numpy(indexes).to(device)
-        mask = torch.from_numpy(mask).to(device)
-        return indexes, mask
-    def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
-        """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving.
-        The sequence is reverted using up to timesteps if specified, and non-pattern coordinates
-        are filled with the special token.
-        Args:
-            s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
-            special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
-        Returns:
-            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T
-                corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise.
-            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
-            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
-        """
-        B, K, S = s.shape
-        indexes, mask = self._build_reverted_sequence_scatter_indexes(
-            S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
-        )
-        s = s.view(B, -1)
-        # we append the special token as the last index of our flattened z tensor
-        s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
-        values = s[:, indexes.view(-1)]
-        values = values.view(B, K, indexes.shape[-1])
-        return values, indexes, mask
-    def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False):
-        """Revert model logits obtained on a sequence built from the pattern
-        back to a tensor matching the original sequence.
-        This method is similar to ``revert_pattern_sequence`` with the following specificities:
-        1. It is designed to work with the extra cardinality dimension
-        2. We return the logits for the first sequence item that matches the special_token and
-        which matching target in the original sequence is the first item of the sequence,
-        while we skip the last logits as there is no matching target
-        """
-        B, card, K, S = logits.shape
-        indexes, mask = self._build_reverted_sequence_scatter_indexes(
-            S, K, keep_only_valid_steps, is_model_output=True, device=logits.device
-        )
-        logits = logits.reshape(B, card, -1)
-        # we append the special token as the last index of our flattened z tensor
-        logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1)  # [B, card, K x S]
-        values = logits[:, :, indexes.view(-1)]
-        values = values.view(B, card, K, indexes.shape[-1])
-        return values, indexes, mask
-class CodebooksPatternProvider(ABC):
-    """Abstraction around providing pattern for interleaving codebooks.
-    The CodebooksPatternProvider abstraction allows to implement various strategies to
-    define interleaving pattern of sequences composed of multiple codebooks. For a given
-    number of codebooks `n_q`, the pattern provider can generate a specified pattern
-    corresponding to a sequence of `T` timesteps with `n_q` parallel codebooks. This pattern
-    can be used to construct a new sequence from the original codes respecting the specified
-    pattern. The pattern is defined as a list of list of code coordinates, code coordinate
-    being a tuple with the original timestep and codebook to build the new sequence.
-    Note that all patterns must start with an empty list that is then used to insert a first
-    sequence step of special tokens in the newly generated sequence.
-    Args:
-        n_q (int): number of codebooks.
-        cached (bool): if True, patterns for a given length are cached. In general
-            that should be true for efficiency reason to avoid synchronization points.
-    """
-    def __init__(self, n_q: int, cached: bool = True):
-        assert n_q > 0
-        self.n_q = n_q
-        self.get_pattern = lru_cache(100)(self.get_pattern)  # type: ignore
-    @abstractmethod
-    def get_pattern(self, timesteps: int) -> Pattern:
-        """Builds pattern with specific interleaving between codebooks.
-        Args:
-            timesteps (int): Total number of timesteps.
-        """
-        raise NotImplementedError()
-class DelayedPatternProvider(CodebooksPatternProvider):
-    """Provider for delayed pattern across delayed codebooks.
-    Codebooks are delayed in the sequence and sequence steps will contain codebooks
-    from different timesteps.
-    Example:
-        Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence:
-        [[1, 2, 3, 4],
-        [1, 2, 3, 4],
-        [1, 2, 3, 4]]
-        The resulting sequence obtained from the returned pattern is:
-        [[S, 1, 2, 3, 4],
-        [S, S, 1, 2, 3],
-        [S, S, S, 1, 2]]
-        (with S being a special token)
-    Args:
-        n_q (int): Number of codebooks.
-        delays (list of int, optional): Delay for each of the codebooks.
-            If delays not defined, each codebook is delayed by 1 compared to the previous one.
-        flatten_first (int): Flatten the first N timesteps.
-        empty_initial (int): Prepend with N empty list of coordinates.
-    """
-    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None,
-                 flatten_first: int = 0, empty_initial: int = 0):
-        super().__init__(n_q)
-        if delays is None:
-            delays = list(range(n_q))
-        self.delays = delays
-        self.flatten_first = flatten_first
-        self.empty_initial = empty_initial
-        assert len(self.delays) == self.n_q
-        assert sorted(self.delays) == self.delays
-    def get_pattern(self, timesteps: int) -> Pattern:
-        omit_special_token = self.empty_initial < 0
-        out: PatternLayout = [] if omit_special_token else [[]]
-        max_delay = max(self.delays)
-        if self.empty_initial:
-            out += [[] for _ in range(self.empty_initial)]
-        if self.flatten_first:
-            for t in range(min(timesteps, self.flatten_first)):
-                for q in range(self.n_q):
-                    out.append([LayoutCoord(t, q)])
-        for t in range(self.flatten_first, timesteps + max_delay):
-            v = []
-            for q, delay in enumerate(self.delays):
-                t_for_q = t - delay
-                if t_for_q >= self.flatten_first:
-                    v.append(LayoutCoord(t_for_q, q))
-            out.append(v)
-        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
-class ParallelPatternProvider(DelayedPatternProvider):
-    """Provider for parallel pattern across codebooks.
-    This pattern provider is a special case of the delayed pattern with actually no delay,
-    hence delays=repeat(0, n_q).
-    Args:
-        n_q (int): Number of codebooks.
-        empty_initial (int): Prepend with N empty list of coordinates.
-    """
-    def __init__(self, n_q: int, empty_initial: int = 0):
-        super().__init__(n_q, [0] * n_q, empty_initial=empty_initial)
-class UnrolledPatternProvider(CodebooksPatternProvider):
-    """Provider for unrolling codebooks pattern.
-    This pattern provider enables to represent the codebook flattened completely or only to some extend
-    while also specifying a given delay between the flattened codebooks representation, allowing to
-    unroll the codebooks in the sequence.
-    Example:
-        1. Flattening of the codebooks.
-        By default, the pattern provider will fully flatten the codebooks such as flattening=range(n_q),
-        taking n_q = 3 and timesteps = 4:
-        [[1, 2, 3, 4],
-         [1, 2, 3, 4],
-         [1, 2, 3, 4]]
-        will result into:
-        [[S, S, 1, S, S, 2, S, S, 3, S, S, 4],
-         [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
-         [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
-        2. Partial flattening of the codebooks. The ``flattening`` parameter allows to specify the inner step
-        for each of the codebook, allowing to define which codebook to flatten (or keep in parallel), for example
-        taking n_q = 3, timesteps = 4 and flattening = [0, 1, 1]:
-        [[1, 2, 3, 4],
-         [1, 2, 3, 4],
-         [1, 2, 3, 4]]
-        will result into:
-        [[S, 1, S, S, 2, S, S, 3, S, S, 4, S],
-         [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
-         [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
-        3. Flattening with delay. The ``delay`` parameter allows to further unroll the sequence of codebooks
-        allowing to specify the delay per codebook. Note that the delay between codebooks flattened to the
-        same inner timestep should be coherent. For example, taking n_q = 3, timesteps = 4, flattening = [0, 1, 1]
-        and delays = [0, 3, 3]:
-        [[1, 2, 3, 4],
-         [1, 2, 3, 4],
-         [1, 2, 3, 4]]
-        will result into:
-        [[S, S, S, 1, S, 2, S, 3, S, 4],
-         [S, S, S, 1, S, 2, S, 3, S, 4],
-         [1, 2, 3, S, 4, S, 5, S, 6, S]]
-    Args:
-        n_q (int): Number of codebooks.
-        flattening (list of int, optional): Flattening schema over the codebooks. If not defined,
-            the codebooks will be flattened to 1 codebook per step, meaning that the sequence will
-            have n_q extra steps for each timestep.
-        delays (list of int, optional): Delay for each of the codebooks. If not defined,
-            no delay is added and therefore will default to [0] * ``n_q``.
-            Note that two codebooks that will be flattened to the same inner step
-            should have the same delay, otherwise the pattern is considered as invalid.
-    """
-    FlattenedCodebook = namedtuple('FlattenedCodebook', ['codebooks', 'delay'])
-    def __init__(self, n_q: int, flattening: tp.Optional[tp.List[int]] = None,
-                 delays: tp.Optional[tp.List[int]] = None):
-        super().__init__(n_q)
-        if flattening is None:
-            flattening = list(range(n_q))
-        if delays is None:
-            delays = [0] * n_q
-        assert len(flattening) == n_q
-        assert len(delays) == n_q
-        assert sorted(flattening) == flattening
-        assert sorted(delays) == delays
-        self._flattened_codebooks = self._build_flattened_codebooks(delays, flattening)
-        self.max_delay = max(delays)
-    def _build_flattened_codebooks(self, delays: tp.List[int], flattening: tp.List[int]):
-        """Build a flattened codebooks representation as a dictionary of inner step
-        and the actual codebook indices corresponding to the flattened codebook. For convenience, we
-        also store the delay associated to the flattened codebook to avoid maintaining an extra mapping.
-        """
-        flattened_codebooks: dict = {}
-        for q, (inner_step, delay) in enumerate(zip(flattening, delays)):
-            if inner_step not in flattened_codebooks:
-                flat_codebook = UnrolledPatternProvider.FlattenedCodebook(codebooks=[q], delay=delay)
-            else:
-                flat_codebook = flattened_codebooks[inner_step]
-                assert flat_codebook.delay == delay, (
-                    "Delay and flattening between codebooks is inconsistent: ",
-                    "two codebooks flattened to the same position should have the same delay."
-                )
-                flat_codebook.codebooks.append(q)
-            flattened_codebooks[inner_step] = flat_codebook
-        return flattened_codebooks
-    @property
-    def _num_inner_steps(self):
-        """Number of inner steps to unroll between timesteps in order to flatten the codebooks.
-        """
-        return max([inner_step for inner_step in self._flattened_codebooks.keys()]) + 1
-    def num_virtual_steps(self, timesteps: int) -> int:
-        return timesteps * self._num_inner_steps + 1
-    def get_pattern(self, timesteps: int) -> Pattern:
-        """Builds pattern for delay across codebooks.
-        Args:
-            timesteps (int): Total number of timesteps.
-        """
-        # the PatternLayout is built as a tuple of sequence position and list of coordinates
-        # so that it can be reordered properly given the required delay between codebooks of given timesteps
-        indexed_out: list = [(-1, [])]
-        max_timesteps = timesteps + self.max_delay
-        for t in range(max_timesteps):
-            # for each timestep, we unroll the flattened codebooks,
-            # emitting the sequence step with the corresponding delay
-            for step in range(self._num_inner_steps):
-                if step in self._flattened_codebooks:
-                    # we have codebooks at this virtual step to emit
-                    step_codebooks = self._flattened_codebooks[step]
-                    t_for_q = t + step_codebooks.delay
-                    coords = [LayoutCoord(t, q) for q in step_codebooks.codebooks]
-                    if t_for_q < max_timesteps and t < max_timesteps:
-                        indexed_out.append((t_for_q, coords))
-                else:
-                    # there is no codebook in this virtual step so we emit an empty list
-                    indexed_out.append((t, []))
-        out = [coords for _, coords in sorted(indexed_out)]
-        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
-class CoarseFirstPattern(CodebooksPatternProvider):
-    """First generates all the codebooks #1 (e.g. coarser), then the remaining ones,
-    potentially with delays.
-    ..Warning:: You must always generate the full training duration at test time, for instance,
-        30 seconds, as otherwise, the fine codebooks will start being generated in an unexpected
-        location. This is due to the non causality of the remaining codebooks with respect to
-        the first ones.
-    Args:
-        n_q (int): Number of codebooks.
-        delays (list of int, optional): Delay for each of the codebooks.
-            If delays not defined, each codebook is delayed by 1 compared to the previous one.
-    """
-    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None):
-        super().__init__(n_q)
-        if delays is None:
-            delays = [0] * (n_q - 1)
-        self.delays = delays
-        assert len(self.delays) == self.n_q - 1
-        assert sorted(self.delays) == self.delays
-    def get_pattern(self, timesteps: int) -> Pattern:
-        out: PatternLayout = [[]]
-        for t in range(timesteps):
-            out.append([LayoutCoord(t, 0)])
-        max_delay = max(self.delays)
-        for t in range(timesteps + max_delay):
-            v = []
-            for q, delay in enumerate(self.delays):
-                t_for_q = t - delay
-                if t_for_q >= 0:
-                    v.append(LayoutCoord(t_for_q, q + 1))
-            out.append(v)
-        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
-class MusicLMPattern(CodebooksPatternProvider):
-    """Almost MusicLM style pattern. This is equivalent to full flattening
-    but in a different order.
-    Args:
-        n_q (int): Number of codebooks.
-        group_by (int): Number of codebooks to group together.
-    """
-    def __init__(self, n_q: int, group_by: int = 2):
-        super().__init__(n_q)
-        self.group_by = group_by
-    def get_pattern(self, timesteps: int) -> Pattern:
-        out: PatternLayout = [[]]
-        for offset in range(0, self.n_q, self.group_by):
-            for t in range(timesteps):
-                for q in range(offset, offset + self.group_by):
-                    out.append([LayoutCoord(t, q)])
-        return Pattern(out, n_q=self.n_q, timesteps=timesteps)

audiocraft/modules/conditioners.py DELETED Viewed

@@ -1,1763 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from collections import defaultdict
-from copy import deepcopy
-from dataclasses import dataclass, field
-from itertools import chain
-import logging
-import math
-from pathlib import Path
-import random
-import re
-import typing as tp
-import warnings
-import einops
-import flashy
-from num2words import num2words
-import spacy
-from transformers import RobertaTokenizer, T5EncoderModel, T5Tokenizer  # type: ignore
-import torch
-from torch import nn
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
-from enum import Enum
-from .chroma import ChromaExtractor
-from .streaming import StreamingModule
-from .transformer import create_sin_embedding, StreamingTransformer
-from ..data.audio import audio_read
-from ..data.audio_dataset import SegmentInfo
-from ..data.audio_utils import convert_audio
-from ..environment import AudioCraftEnvironment
-from ..quantization import ResidualVectorQuantizer
-from ..utils.autocast import TorchAutocast
-from ..utils.cache import EmbeddingCache
-from ..utils.utils import collate, hash_trick, length_to_mask, load_clap_state_dict, warn_once
-logger = logging.getLogger(__name__)
-TextCondition = tp.Optional[str]  # a text condition can be a string or None (if doesn't exist)
-ConditionType = tp.Tuple[torch.Tensor, torch.Tensor]  # condition, mask
-class JascoCondConst(Enum):
-    # String keys naming individual conditions.
-    DRM = 'self_wav'
-    CRD = 'chords'
-    MLD = 'melody'
-    # Groupings built from the keys above (member names suggest "symbolic", "latent"
-    # and "all" -- TODO confirm against the code that consumes them).
-    SYM = {'chords', 'melody'}
-    LAT = {'self_wav'}
-    ALL = ['chords', 'self_wav', 'melody']  # order matters
-class WavCondition(tp.NamedTuple):
-    # Waveform condition: the waveform tensor, a length tensor, and list-valued metadata
-    # (sample rates, optional source paths, optional seek times).
-    wav: torch.Tensor
-    length: torch.Tensor
-    sample_rate: tp.List[int]
-    # NOTE(review): each `[]` default below is one list object shared by every instance
-    # created without that field; never mutate it in place.
-    path: tp.List[tp.Optional[str]] = []
-    seek_time: tp.List[tp.Optional[float]] = []
-class JointEmbedCondition(tp.NamedTuple):
-    # Joint condition pairing a waveform tensor with optional texts, plus a length tensor
-    # and list-valued metadata (sample rates, optional source paths, optional seek times).
-    wav: torch.Tensor
-    text: tp.List[tp.Optional[str]]
-    length: torch.Tensor
-    sample_rate: tp.List[int]
-    # NOTE(review): each `[]` default below is one list object shared by every instance
-    # created without that field; never mutate it in place.
-    path: tp.List[tp.Optional[str]] = []
-    seek_time: tp.List[tp.Optional[float]] = []
-class SymbolicCondition(tp.NamedTuple):
-    # Symbolic condition; both fields are optional tensors and default to None, so an
-    # instance may carry chords only, a melody only, both, or neither.
-    frame_chords: tp.Optional[torch.Tensor] = None
-    melody: tp.Optional[torch.Tensor] = None
-@dataclass
-class ConditioningAttributes:
-    text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
-    wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
-    joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
-    symbolic: tp.Dict[str, SymbolicCondition] = field(default_factory=dict)
-    def __getitem__(self, item):
-        return getattr(self, item)
-    @property
-    def text_attributes(self):
-        return self.text.keys()
-    @property
-    def wav_attributes(self):
-        return self.wav.keys()
-    @property
-    def joint_embed_attributes(self):
-        return self.joint_embed.keys()
-    @property
-    def symbolic_attributes(self):
-        return self.symbolic.keys()
-    @property
-    def attributes(self):
-        return {
-            "text": self.text_attributes,
-            "wav": self.wav_attributes,
-            "joint_embed": self.joint_embed_attributes,
-            "symbolic": self.symbolic_attributes,
-        }
-    def to_flat_dict(self):
-        return {
-            **{f"text.{k}": v for k, v in self.text.items()},
-            **{f"wav.{k}": v for k, v in self.wav.items()},
-            **{f"joint_embed.{k}": v for k, v in self.joint_embed.items()},
-            **{f"symbolic.{k}": v for k, v in self.symbolic.items()}
-        }
-    @classmethod
-    def from_flat_dict(cls, x):
-        out = cls()
-        for k, v in x.items():
-            kind, att = k.split(".")
-            out[kind][att] = v
-        return out
-class SegmentWithAttributes(SegmentInfo):
-    """Base class for all dataclasses that are used for conditioning.
-    All child classes should implement `to_condition_attributes` that converts
-    the existing attributes to a dataclass of type ConditioningAttributes.
-    """
-    def to_condition_attributes(self) -> ConditioningAttributes:
-        raise NotImplementedError()
-def nullify_condition(condition: ConditionType, dim: int = 1):
-    """Transform an input condition to a null condition.
-    The condition is reduced to a single all-zero entry along `dim`, similarly
-    to how it is done inside WhiteSpaceTokenizer and NoopTokenizer. The mask of
-    the input condition is ignored and replaced by a zero mask of shape [B, 1].
-    Args:
-        condition (ConditionType): A tuple of condition and mask (tuple[torch.Tensor, torch.Tensor])
-        dim (int): The dimension that will be truncated (should be the time dimension)
-        WARNING!: dim should not be the batch dimension!
-    Returns:
-        ConditionType: A tuple of null condition and mask
-    """
-    assert dim != 0, "dim cannot be the batch dimension!"
-    assert isinstance(condition, tuple) and \
-        isinstance(condition[0], torch.Tensor) and \
-        isinstance(condition[1], torch.Tensor), "'nullify_condition' got an unexpected input type!"
-    cond, mask = condition
-    B = cond.shape[0]
-    last_dim = cond.dim() - 1
-    # Move `dim` to the last position so it can be truncated with a plain `[..., :1]`.
-    out = cond.transpose(dim, last_dim)
-    # Keep a single entry along that dimension and zero it by multiplication.
-    out = 0. * out[..., :1]
-    # Restore the original dimension order.
-    out = out.transpose(dim, last_dim)
-    mask = torch.zeros((B, 1), device=out.device).int()
-    assert cond.dim() == out.dim()
-    return out, mask
-def nullify_wav(cond: WavCondition) -> WavCondition:
-    """Transform a WavCondition to a nullified WavCondition.
-    It replaces the wav by a null tensor, forces its length to 0, and replaces metadata by dummy attributes.
-    Args:
-        cond (WavCondition): Wav condition with wav, tensor of shape [B, T].
-    Returns:
-        WavCondition: Nullified wav condition.
-    """
-    null_wav, _ = nullify_condition((cond.wav, torch.zeros_like(cond.wav)), dim=cond.wav.dim() - 1)
-    return WavCondition(
-        wav=null_wav,
-        length=torch.tensor([0] * cond.wav.shape[0], device=cond.wav.device),
-        sample_rate=cond.sample_rate,
-        path=[None] * cond.wav.shape[0],
-        seek_time=[None] * cond.wav.shape[0],
-    )
-def nullify_joint_embed(embed: JointEmbedCondition) -> JointEmbedCondition:
-    """Nullify the joint embedding condition by replacing it by a null tensor, forcing its length to 0,
-    and replacing metadata by dummy attributes.
-    Args:
-        embed (JointEmbedCondition): Joint embedding condition with wav and text, wav tensor of shape [B, C, T].
-    Returns:
-        JointEmbedCondition: Nullified joint embedding condition.
-    """
-    # The mask passed to nullify_condition is only a placeholder; the returned mask is dropped.
-    null_wav, _ = nullify_condition((embed.wav, torch.zeros_like(embed.wav)), dim=embed.wav.dim() - 1)
-    # NOTE(review): unlike nullify_wav, `length` has a single element whatever the batch
-    # size is, and seek times are 0 rather than None -- confirm this is intended.
-    return JointEmbedCondition(
-        wav=null_wav, text=[None] * len(embed.text),
-        length=torch.LongTensor([0]).to(embed.wav.device),
-        sample_rate=embed.sample_rate,
-        path=[None] * embed.wav.shape[0],
-        seek_time=[0] * embed.wav.shape[0],
-    )
-def nullify_chords(sym_cond: SymbolicCondition, null_chord_idx: int = 194) -> SymbolicCondition:
-    """Nullify the symbolic condition by setting all frame chords to a specified null chord index.
-    Args:
-        sym_cond (SymbolicCondition): The symbolic condition containing frame chords to be nullified.
-        null_chord_idx (int, optional): The index to use for nullifying the chords. Defaults to 194 (Chordino).
-    Returns:
-        SymbolicCondition: A new symbolic condition with all frame chords set to the null chord index.
-    """
-    return SymbolicCondition(frame_chords=torch.ones_like(sym_cond.frame_chords) * null_chord_idx)  # type: ignore
-def nullify_melody(sym_cond: SymbolicCondition) -> SymbolicCondition:
-    """Nullify the symbolic condition by replacing the melody matrix with a zeros matrix.
-    Args:
-        sym_cond (SymbolicCondition): The symbolic condition containing the melody to be nullified.
-    Returns:
-        SymbolicCondition: A new symbolic condition holding only a melody, which is
-            all zeros and shaped like the input melody.
-    """
-    return SymbolicCondition(melody=torch.zeros_like(sym_cond.melody))  # type: ignore
-def _drop_description_condition(conditions: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
-    """Drop the text condition but keep the wav conditon on a list of ConditioningAttributes.
-    This is useful to calculate l_style in the double classifier free guidance formula.
-    See paragraph 4.3 in https://arxiv.org/pdf/2407.12563
-    Args:
-        conditions (tp.List[ConditioningAttributes]): List of conditions.
-    """
-    # We assert that description and self_wav are in the conditions
-    for condition in conditions:
-        assert 'description' in condition.text.keys()
-        assert 'self_wav' in condition.wav.keys()
-    return AttributeDropout(p={'text': {'description': 1.0},
-                               'wav': {'self_wav': 0.0}})(conditions)
-class Tokenizer:
-    """Base tokenizer implementation
-    (in case we want to introduce more advances tokenizers in the future).
-    """
-    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        raise NotImplementedError()
-class WhiteSpaceTokenizer(Tokenizer):
-    """This tokenizer should be used for natural language descriptions.
-    Each text is processed by a spaCy pipeline, optionally stripped of stop words,
-    stripped of punctuation, optionally lemmatized, and each remaining word is
-    hashed into one of `n_bins` indices.
-    For example:
-    ["he didn't, know he's going home.", 'shorter sentence'] =>
-    [[78, 62, 31,  4, 78, 25, 19, 34],
-    [59, 77,  0,  0,  0,  0,  0,  0]]
-    Args:
-        n_bins (int): Number of bins passed to `hash_trick`.
-        pad_idx (int): Index used for padding and for missing texts. Defaults to 0.
-        language (str): Name of the spaCy pipeline to load (downloaded if loading fails with IOError).
-        lemma (bool): Whether to use the lemma of each word instead of its raw text.
-        stopwords (bool): Whether to remove stop words.
-    """
-    PUNCTUATION = "?:!.,;"
-    def __init__(self, n_bins: int, pad_idx: int = 0, language: str = "en_core_web_sm",
-                 lemma: bool = True, stopwords: bool = True) -> None:
-        self.n_bins = n_bins
-        self.pad_idx = pad_idx
-        self.lemma = lemma
-        self.stopwords = stopwords
-        try:
-            self.nlp = spacy.load(language)
-        except IOError:
-            # Pipeline not available locally: download it, then load it again.
-            spacy.cli.download(language)  # type: ignore
-            self.nlp = spacy.load(language)
-    @tp.no_type_check
-    def __call__(self, texts: tp.List[tp.Optional[str]],
-                 return_text: bool = False) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        """Take a list of strings and convert them to a tensor of indices.
-        Args:
-            texts (list[str]): List of strings. A None entry yields a single pad token
-                with length 0.
-            return_text (bool, optional): Whether to return text as additional tuple item. Defaults to False.
-        Returns:
-            tuple[torch.Tensor, torch.Tensor]:
-                - Indices of words in the LUT.
-                - And a mask indicating where the padding tokens are
-            When `return_text` is True, the normalized texts are returned as a third item.
-        """
-        output, lengths = [], []
-        # Work on a copy: entries are overwritten below with their normalized form.
-        texts = deepcopy(texts)
-        for i, text in enumerate(texts):
-            # if current sample doesn't have a certain attribute, replace with pad token
-            if text is None:
-                output.append(torch.Tensor([self.pad_idx]))
-                lengths.append(0)
-                continue
-            # convert numbers to words
-            text = re.sub(r"(\d+)", lambda x: num2words(int(x.group(0))), text)  # type: ignore
-            # normalize text
-            text = self.nlp(text)  # type: ignore
-            # remove stopwords
-            if self.stopwords:
-                text = [w for w in text if not w.is_stop]  # type: ignore
-            # remove punctuation
-            # NOTE(review): this is a substring test against PUNCTUATION, so a token such
-            # as "!." is dropped as well as single punctuation characters.
-            text = [w for w in text if w.text not in self.PUNCTUATION]  # type: ignore
-            # lemmatize if needed
-            text = [getattr(t, "lemma_" if self.lemma else "text") for t in text]  # type: ignore
-            texts[i] = " ".join(text)
-            lengths.append(len(text))
-            # convert to tensor
-            tokens = torch.Tensor([hash_trick(w, self.n_bins) for w in text])
-            output.append(tokens)
-        mask = length_to_mask(torch.IntTensor(lengths)).int()
-        # pad_sequence stacks along a new second dimension; transpose to one row per text.
-        padded_output = pad_sequence(output, padding_value=self.pad_idx).int().t()
-        if return_text:
-            return padded_output, mask, texts  # type: ignore
-        return padded_output, mask
-class NoopTokenizer(Tokenizer):
-    """This tokenizer should be used for global conditioners such as: artist, genre, key, etc.
-    The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split
-    strings, so "Jeff Buckley" will get it's own index. Whereas WhiteSpaceTokenizer will
-    split it to ["Jeff", "Buckley"] and return an index per word.
-    For example:
-    ["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
-    ["Metal", "Rock", "Classical"] => [0, 223, 51]
-    """
-    def __init__(self, n_bins: int, pad_idx: int = 0):
-        self.n_bins = n_bins
-        self.pad_idx = pad_idx
-    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        output, lengths = [], []
-        for text in texts:
-            # if current sample doesn't have a certain attribute, replace with pad token
-            if text is None:
-                output.append(self.pad_idx)
-                lengths.append(0)
-            else:
-                output.append(hash_trick(text, self.n_bins))
-                lengths.append(1)
-        tokens = torch.LongTensor(output).unsqueeze(1)
-        mask = length_to_mask(torch.IntTensor(lengths)).int()
-        return tokens, mask
-class BaseConditioner(nn.Module):
-    """Base model for all conditioner modules.
-    We allow the output dim to be different than the hidden dim for two reasons:
-    1) keep our LUTs small when the vocab is large;
-    2) make all condition dims consistent.
-    Args:
-        dim (int): Hidden dim of the model.
-        output_dim (int): Output dim of the conditioner. A negative value disables
-            the output projection (no `output_proj` attribute is created).
-    """
-    def __init__(self, dim: int, output_dim: int):
-        super().__init__()
-        self.dim = dim
-        self.output_dim = output_dim
-        # Projection is omitted only when output_dim is negative; output_dim == 0
-        # still creates a Linear layer.
-        if self.output_dim > -1:  # omit projection when output_dim < 0
-            self.output_proj = nn.Linear(dim, output_dim)
-    def tokenize(self, *args, **kwargs) -> tp.Any:
-        """Should be any part of the processing that will lead to a synchronization
-        point, e.g. BPE tokenization with transfer to the GPU.
-        The returned value will be saved and returned later when calling forward().
-        """
-        raise NotImplementedError()
-    def forward(self, inputs: tp.Any) -> ConditionType:
-        """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
-        Outputs a ConditionType, after the input data was embedded as a dense vector.
-        Returns:
-            ConditionType:
-                - A tensor of size [B, T, D] where B is the batch size, T is the length of the
-                  output embedding and D is the dimension of the embedding.
-                - And a mask indicating where the padding tokens are.
-        """
-        raise NotImplementedError()
-class TextConditioner(BaseConditioner):
-    # Marker base class for the text conditioners; adds nothing to BaseConditioner.
-    ...
-class LUTConditioner(TextConditioner):
-    """Lookup table TextConditioner.
-    Args:
-        n_bins (int): Number of bins.
-        dim (int): Hidden dim of the model (text-encoder/LUT).
-        output_dim (int): Output dim of the conditioner.
-        tokenizer (str): Name of the tokenizer.
-        pad_idx (int, optional): Index for padding token. Defaults to 0.
-    """
-    def __init__(self, n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0):
-        super().__init__(dim, output_dim)
-        self.embed = nn.Embedding(n_bins, dim)
-        self.tokenizer: Tokenizer
-        if tokenizer == 'whitespace':
-            self.tokenizer = WhiteSpaceTokenizer(n_bins, pad_idx=pad_idx)
-        elif tokenizer == 'noop':
-            self.tokenizer = NoopTokenizer(n_bins, pad_idx=pad_idx)
-        else:
-            raise ValueError(f"unrecognized tokenizer `{tokenizer}`.")
-    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        device = self.embed.weight.device
-        tokens, mask = self.tokenizer(x)
-        tokens, mask = tokens.to(device), mask.to(device)
-        return tokens, mask
-    def forward(self, inputs: tp.Tuple[torch.Tensor, torch.Tensor]) -> ConditionType:
-        tokens, mask = inputs
-        embeds = self.embed(tokens)
-        embeds = self.output_proj(embeds)
-        embeds = (embeds * mask.unsqueeze(-1))
-        return embeds, mask
-class T5Conditioner(TextConditioner):
-    """T5-based TextConditioner.
-    Args:
-        name (str): Name of the T5 model.
-        output_dim (int): Output dim of the conditioner.
-        finetune (bool): Whether to fine-tune T5 at train time.
-        device (str): Device for T5 Conditioner.
-        autocast_dtype (tp.Optional[str], optional): Autocast dtype.
-        word_dropout (float, optional): Word dropout probability.
-        normalize_text (bool, optional): Whether to apply text normalization.
-    """
-    MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
-              "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
-              "google/flan-t5-xl", "google/flan-t5-xxl"]
-    MODELS_DIMS = {
-        "t5-small": 512,
-        "t5-base": 768,
-        "t5-large": 1024,
-        "t5-3b": 1024,
-        "t5-11b": 1024,
-        "google/flan-t5-small": 512,
-        "google/flan-t5-base": 768,
-        "google/flan-t5-large": 1024,
-        "google/flan-t5-3b": 1024,
-        "google/flan-t5-11b": 1024,
-    }
-    def __init__(self, name: str, output_dim: int, finetune: bool, device: str,
-                 autocast_dtype: tp.Optional[str] = 'float32', word_dropout: float = 0.,
-                 normalize_text: bool = False):
-        assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})"
-        super().__init__(self.MODELS_DIMS[name], output_dim)
-        self.device = device
-        self.name = name
-        self.finetune = finetune
-        self.word_dropout = word_dropout
-        if autocast_dtype is None or self.device == 'cpu':
-            self.autocast = TorchAutocast(enabled=False)
-            if self.device != 'cpu':
-                logger.warning("T5 has no autocast, this might lead to NaN")
-        else:
-            dtype = getattr(torch, autocast_dtype)
-            assert isinstance(dtype, torch.dtype)
-            logger.info(f"T5 will be evaluated with autocast as {autocast_dtype}")
-            self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
-        # Let's disable logging temporarily because T5 will vomit some errors otherwise.
-        # thanks https://gist.github.com/simon-weber/7853144
-        previous_level = logging.root.manager.disable
-        logging.disable(logging.ERROR)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            try:
-                self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
-                t5 = T5EncoderModel.from_pretrained(name).train(mode=finetune)
-            finally:
-                logging.disable(previous_level)
-        if finetune:
-            self.t5 = t5
-        else:
-            # this makes sure that the t5 models is not part
-            # of the saved checkpoint
-            self.__dict__['t5'] = t5.to(device)
-        self.normalize_text = normalize_text
-        if normalize_text:
-            self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)
-    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
-        # if current sample doesn't have a certain attribute, replace with empty string
-        entries: tp.List[str] = [xi if xi is not None else "" for xi in x]
-        if self.normalize_text:
-            _, _, entries = self.text_normalizer(entries, return_text=True)
-        if self.word_dropout > 0. and self.training:
-            new_entries = []
-            for entry in entries:
-                words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
-                new_entries.append(" ".join(words))
-            entries = new_entries
-        empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])
-        inputs = self.t5_tokenizer(entries, return_tensors='pt', padding=True).to(self.device)
-        mask = inputs['attention_mask']
-        mask[empty_idx, :] = 0  # zero-out index where the input is non-existant
-        return inputs
-    def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
-        mask = inputs['attention_mask']
-        with torch.set_grad_enabled(self.finetune), self.autocast:
-            embeds = self.t5(**inputs).last_hidden_state
-        embeds = self.output_proj(embeds.to(self.output_proj.weight))
-        embeds = (embeds * mask.unsqueeze(-1))
-        return embeds, mask
-class WaveformConditioner(BaseConditioner):
-    """Base class for all conditioners that take a waveform as input.
-    Classes that inherit must implement `_get_wav_embedding` that outputs
-    a continuous tensor, and `_downsampling_factor` that returns the down-sampling
-    factor of the embedding model.
-    Args:
-        dim (int): The internal representation dimension.
-        output_dim (int): Output dimension.
-        device (tp.Union[torch.device, str]): Device.
-    """
-    def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
-        super().__init__(dim, output_dim)
-        self.device = device
-        # if False no masking is done, used in ChromaStemConditioner when completing by periodicity a sample.
-        self._use_masking = True
-    def tokenize(self, x: WavCondition) -> WavCondition:
-        """Move the waveform and length tensors to `self.device`; metadata is passed through."""
-        wav, length, sample_rate, path, seek_time = x
-        assert length is not None
-        return WavCondition(wav.to(self.device), length.to(self.device), sample_rate, path, seek_time)
-    def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
-        """Gets as input a WavCondition and returns a dense embedding."""
-        raise NotImplementedError()
-    def _downsampling_factor(self):
-        """Returns the downsampling factor of the embedding model."""
-        raise NotImplementedError()
-    def forward(self, x: WavCondition) -> ConditionType:
-        """Extract condition embedding and mask from a waveform and its metadata.
-        Args:
-            x (WavCondition): Waveform condition containing raw waveform and metadata.
-        Returns:
-            ConditionType: a dense vector representing the conditioning along with its mask
-        """
-        wav, lengths, *_ = x
-        # The embedding itself is always computed without gradients.
-        with torch.no_grad():
-            embeds = self._get_wav_embedding(x)
-        # `output_proj` only exists when the base class created it (non-negative output_dim).
-        if hasattr(self, 'output_proj'):
-            embeds = embeds.to(self.output_proj.weight)
-            embeds = self.output_proj(embeds)
-        if lengths is not None and self._use_masking:
-            # Convert lengths from waveform samples to embedding frames.
-            lengths = lengths / self._downsampling_factor()
-            mask = length_to_mask(lengths, max_len=embeds.shape[1]).int()  # type: ignore
-        else:
-            # No masking: every position of the embedding is kept.
-            mask = torch.ones_like(embeds[..., 0])
-        embeds = (embeds * mask.unsqueeze(-1))
-        return embeds, mask
-class ChromaStemConditioner(WaveformConditioner):
-    """Chroma conditioner based on stems.
-    The ChromaStemConditioner uses DEMUCS to first filter out drums and bass, as
-    the drums and bass often dominate the chroma leading to the chroma features
-    not containing information about the melody.
-    Args:
-        output_dim (int): Output dimension for the conditioner.
-        sample_rate (int): Sample rate for the chroma extractor.
-        n_chroma (int): Number of chroma bins for the chroma extractor.
-        radix2_exp (int): Size of stft window for the chroma extractor (power of 2, e.g. 12 -> 2^12).
-        duration (int): duration used during training. This is later used for correct padding
-            in case we are using chroma as prefix.
-        match_len_on_eval (bool, optional): if True then all chromas are padded to the training
-            duration. Defaults to False.
-        eval_wavs (str, optional): path to a dataset manifest with waveform, this waveforms are used as
-            conditions during eval (for cases where we don't want to leak test conditions like MusicCaps).
-            Defaults to None.
-        n_eval_wavs (int, optional): limits the number of waveforms used for conditioning. Defaults to 0.
-        device (tp.Union[torch.device, str], optional): Device for the conditioner.
-        **kwargs: Additional parameters for the chroma extractor.
-    """
-    def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int,
-                 duration: float, match_len_on_eval: bool = True, eval_wavs: tp.Optional[str] = None,
-                 n_eval_wavs: int = 0, cache_path: tp.Optional[tp.Union[str, Path]] = None,
-                 device: tp.Union[torch.device, str] = 'cpu', **kwargs):
-        from demucs import pretrained
-        super().__init__(dim=n_chroma, output_dim=output_dim, device=device)
-        self.autocast = TorchAutocast(enabled=device != 'cpu', device_type=self.device, dtype=torch.float32)
-        self.sample_rate = sample_rate
-        self.match_len_on_eval = match_len_on_eval
-        if match_len_on_eval:
-            self._use_masking = False
-        self.duration = duration
-        self.__dict__['demucs'] = pretrained.get_model('htdemucs').to(device)
-        stem_sources: list = self.demucs.sources  # type: ignore
-        self.stem_indices = torch.LongTensor([stem_sources.index('vocals'), stem_sources.index('other')]).to(device)
-        self.chroma = ChromaExtractor(sample_rate=sample_rate, n_chroma=n_chroma,
-                                      radix2_exp=radix2_exp, **kwargs).to(device)
-        self.chroma_len = self._get_chroma_len()
-        self.eval_wavs: tp.Optional[torch.Tensor] = self._load_eval_wavs(eval_wavs, n_eval_wavs)
-        self.cache = None
-        if cache_path is not None:
-            self.cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
-                                        compute_embed_fn=self._get_full_chroma_for_cache,
-                                        extract_embed_fn=self._extract_chroma_chunk)
-    def _downsampling_factor(self) -> int:
-        return self.chroma.winhop
-    def _load_eval_wavs(self, path: tp.Optional[str], num_samples: int) -> tp.Optional[torch.Tensor]:
-        """Load pre-defined waveforms from a json.
-        These waveforms will be used for chroma extraction during evaluation.
-        This is done to make the evaluation on MusicCaps fair (we shouldn't see the chromas of MusicCaps).
-        """
-        if path is None:
-            return None
-        logger.info(f"Loading evaluation wavs from {path}")
-        from audiocraft.data.audio_dataset import AudioDataset
-        dataset: AudioDataset = AudioDataset.from_meta(
-            path, segment_duration=self.duration, min_audio_duration=self.duration,
-            sample_rate=self.sample_rate, channels=1)
-        if len(dataset) > 0:
-            eval_wavs = dataset.collater([dataset[i] for i in range(num_samples)]).to(self.device)
-            logger.info(f"Using {len(eval_wavs)} evaluation wavs for chroma-stem conditioner")
-            return eval_wavs
-        else:
-            raise ValueError("Could not find evaluation wavs, check lengths of wavs")
-    def reset_eval_wavs(self, eval_wavs: tp.Optional[torch.Tensor]) -> None:
-        self.eval_wavs = eval_wavs
-    def has_eval_wavs(self) -> bool:
-        return self.eval_wavs is not None
-    def _sample_eval_wavs(self, num_samples: int) -> torch.Tensor:
-        """Sample wavs from a predefined list."""
-        assert self.eval_wavs is not None, "Cannot sample eval wavs as no eval wavs provided."
-        total_eval_wavs = len(self.eval_wavs)
-        out = self.eval_wavs
-        if num_samples > total_eval_wavs:
-            out = self.eval_wavs.repeat(num_samples // total_eval_wavs + 1, 1, 1)
-        return out[torch.randperm(len(out))][:num_samples]
-    def _get_chroma_len(self) -> int:
-        """Get length of chroma during training."""
-        dummy_wav = torch.zeros((1, int(self.sample_rate * self.duration)), device=self.device)
-        dummy_chr = self.chroma(dummy_wav)
-        return dummy_chr.shape[1]
-    @torch.no_grad()
-    def _get_stemmed_wav(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
-        """Get parts of the wav that holds the melody, extracting the main stems from the wav."""
-        from demucs.apply import apply_model
-        from demucs.audio import convert_audio
-        with self.autocast:
-            wav = convert_audio(
-                wav, sample_rate, self.demucs.samplerate, self.demucs.audio_channels)  # type: ignore
-            stems = apply_model(self.demucs, wav, device=self.device)  # type: ignore
-            stems = stems[:, self.stem_indices]  # extract relevant stems for melody conditioning
-            mix_wav = stems.sum(1)  # merge extracted stems to single waveform
-            mix_wav = convert_audio(mix_wav, self.demucs.samplerate, self.sample_rate, 1)  # type: ignore
-            return mix_wav
-    @torch.no_grad()
-    def _extract_chroma(self, wav: torch.Tensor) -> torch.Tensor:
-        """Extract chroma features from the waveform."""
-        with self.autocast:
-            return self.chroma(wav)
-    @torch.no_grad()
-    def _compute_wav_embedding(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
-        """Compute wav embedding, applying stem and chroma extraction."""
-        # avoid 0-size tensors when we are working with null conds
-        if wav.shape[-1] == 1:
-            return self._extract_chroma(wav)
-        stems = self._get_stemmed_wav(wav, sample_rate)
-        chroma = self._extract_chroma(stems)
-        return chroma
-    @torch.no_grad()
-    def _get_full_chroma_for_cache(self, path: tp.Union[str, Path], x: WavCondition, idx: int) -> torch.Tensor:
-        """Extract chroma from the whole audio waveform at the given path."""
-        wav, sr = audio_read(path)
-        wav = wav[None].to(self.device)
-        wav = convert_audio(wav, sr, self.sample_rate, to_channels=1)
-        chroma = self._compute_wav_embedding(wav, self.sample_rate)[0]
-        return chroma
-    def _extract_chroma_chunk(self, full_chroma: torch.Tensor, x: WavCondition, idx: int) -> torch.Tensor:
-        """Extract a chunk of chroma from the full chroma derived from the full waveform."""
-        wav_length = x.wav.shape[-1]
-        seek_time = x.seek_time[idx]
-        assert seek_time is not None, (
-            "WavCondition seek_time is required "
-            "when extracting chroma chunks from pre-computed chroma.")
-        full_chroma = full_chroma.float()
-        frame_rate = self.sample_rate / self._downsampling_factor()
-        target_length = int(frame_rate * wav_length / self.sample_rate)
-        index = int(frame_rate * seek_time)
-        out = full_chroma[index: index + target_length]
-        out = F.pad(out[None], (0, 0, 0, target_length - out.shape[0]))[0]
-        return out.to(self.device)
-    @torch.no_grad()
-    def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
-        """Return the chroma embedding for a batch of wav conditions.
-        Three sources are tried in order: precomputed evaluation wavs (outside training only),
-        the embedding cache (when every path is defined and the condition is not nullified),
-        and finally the condition wav itself, embedded on the fly.
-        When `match_len_on_eval` is set, the result is truncated or tiled to `chroma_len` frames.
-        """
-        eval_wav: tp.Optional[torch.Tensor] = None
-        if not self.training and self.eval_wavs is not None:
-            warn_once(logger, "Using precomputed evaluation wavs!")
-            eval_wav = self._sample_eval_wavs(len(x.wav))
-        all_paths_defined = all(p is not None for p in x.path)
-        cond_is_set = x.wav.shape[-1] > 1
-        if eval_wav is not None:
-            chroma = self._compute_wav_embedding(eval_wav, self.sample_rate)
-        elif self.cache is not None and all_paths_defined and cond_is_set:
-            cache_paths = [Path(p) for p in x.path if p is not None]
-            chroma = self.cache.get_embed_from_cache(cache_paths, x)
-        else:
-            assert all(sr == x.sample_rate[0] for sr in x.sample_rate), "All sample rates in batch should be equal."
-            chroma = self._compute_wav_embedding(x.wav, x.sample_rate[0])
-        if not self.match_len_on_eval:
-            return chroma
-        _, T, _ = chroma.shape
-        if T > self.chroma_len:
-            chroma = chroma[:, :self.chroma_len]
-            logger.debug(f"Chroma was truncated to match length! ({T} -> {chroma.shape[1]})")
-        elif T < self.chroma_len:
-            # Tile along the frame axis, then cut the overshoot
-            n_repeat = int(math.ceil(self.chroma_len / T))
-            chroma = chroma.repeat(1, n_repeat, 1)[:, :self.chroma_len]
-            logger.debug(f"Chroma was repeated to match length! ({T} -> {chroma.shape[1]})")
-        return chroma
-    def tokenize(self, x: WavCondition) -> WavCondition:
-        """Run the parent tokenization, then populate the embedding cache when it is usable.
-        The cache is only populated if it exists and every path of the batch is defined.
-        """
-        x = super().tokenize(x)
-        every_path_known = all(p is not None for p in x.path)
-        if self.cache is None or not every_path_known:
-            return x
-        cache_paths = [Path(p) for p in x.path if p is not None]
-        self.cache.populate_embed_cache(cache_paths, x)
-        return x
-class FeatureExtractor(WaveformConditioner):
-    """
-    Feature Extractor used for the style conditioner of the paper AUDIO CONDITIONING
-        FOR MUSIC GENERATION VIA DISCRETE BOTTLENECK FEATURES.
-    Given a waveform, we extract an excerpt of defined length randomly subsampled.
-        Then, we feed this excerpt to a feature extractor.
-    Args:
-        model_name (str): 'encodec' or 'mert'.
-        sample_rate (str): sample rate of the input audio. (32000)
-        encodec_checkpoint (str): if encodec is used as a feature extractor, checkpoint
-            of the model. ('//pretrained/facebook/encodec_32khz' is the default)
-        encodec_n_q (int): if encodec is used as a feature extractor it sets the number of
-            quantization streams used in it.
-        length (float): length in seconds of the random subsampled excerpt that is used
-            for conditioning.
-        dim (int): The internal representation dimension.
-        output_dim (int): Output dimension for the conditioner.
-        device (tp.Union[torch.device, str], optional): Device for the conditioner.
-        compute_mask (bool): whether to mask the tokens corresponding to the subsampled
-            excerpt in the computation of the music language model cross-entropy loss.
-        use_middle_of_segment (bool): if True, always take the middle of the input
-            instead of a random subsampled excerpt.
-        ds_rate_compression (int): downsampling parameter of the compression model used
-            for the music language model. (640 for encodec_32khz)
-        num_codebooks_lm (int): the number of codebooks used by the music language model.
-    """
-    def __init__(
-        self, model_name: str,
-        sample_rate: int, encodec_checkpoint: str, encodec_n_q: int, length: float,
-        dim: int, output_dim: int, device: tp.Union[torch.device, str],
-        compute_mask: bool = True,
-        use_middle_of_segment: bool = False, ds_rate_compression: int = 640,
-        num_codebooks_lm: int = 4
-    ):
-        """Load the requested feature extractor and build the matching embedding layer."""
-        assert model_name in ['encodec', 'mert']
-        uses_encodec = model_name == 'encodec'
-        uses_mert = model_name == 'mert'
-        # The backbone is loaded before the parent constructor runs, as in the original ordering
-        if uses_encodec:
-            from ..solvers.compression import CompressionSolver
-            feat_extractor = CompressionSolver.model_from_checkpoint(encodec_checkpoint, device)
-        elif uses_mert:
-            from transformers import AutoModel
-            feat_extractor = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
-        super().__init__(dim=dim, output_dim=output_dim, device=device)
-        self.model_name = model_name
-        self.sample_rate = sample_rate
-        self.compute_mask = compute_mask
-        self.use_middle_of_segment = use_middle_of_segment
-        self.ds_rate_compression = ds_rate_compression
-        self.num_codebooks_lm = num_codebooks_lm
-        # Excerpt length expressed in samples
-        self.length_subwav = int(length * sample_rate)
-        self.feat_extractor: nn.Module
-        self.embed: tp.Union[nn.ModuleList, nn.Linear]
-        # The extractor is written straight into `__dict__` instead of through attribute assignment.
-        # NOTE(review): presumably to keep it out of the registered submodules; confirm.
-        if uses_encodec:
-            self.__dict__["feat_extractor"] = feat_extractor.to(device)
-            self.encodec_n_q = encodec_n_q
-            # One embedding table per Encodec quantization stream
-            self.embed = nn.ModuleList([nn.Embedding(feat_extractor.cardinality, dim) for _ in range(encodec_n_q)])
-        if uses_mert:
-            self.__dict__["feat_extractor"] = feat_extractor.eval().to(device)
-            self.embed = nn.Linear(768, dim)  # hardcoded
-    def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
-        """Embed an excerpt of the conditioning wav with the feature extractor.
-        A wav whose last dimension has size 1 is handled as a nullified condition: `temp_mask`
-        is reset and a single all-zero frame is returned. Otherwise an excerpt of at most
-        `length_subwav` samples is taken, either centred (`use_middle_of_segment`) or at a
-        random offset, and encoded with Encodec tokens or MERT hidden states.
-        When `compute_mask` is set, the mask matching the excerpt is stored in `temp_mask`.
-        Args:
-            x (WavCondition): Waveform condition; `x.wav` is indexed as [B, C, T].
-        Returns:
-            torch.Tensor: Embeddings of shape [B, T, dim].
-        """
-        num_samples = x.wav.shape[-1]
-        if num_samples == 1:
-            self.temp_mask = None
-            return torch.zeros(x.wav.shape[0], 1, self.dim, device=self.device)
-        # Clamp the last admissible start at 0: a wav shorter than the excerpt would otherwise
-        # make `random.randint` raise on an empty range, or give a negative centred start.
-        max_start = max(0, num_samples - self.length_subwav)
-        with torch.no_grad():
-            if self.use_middle_of_segment:
-                start = int(max_start / 2)
-            else:
-                start = random.randint(0, max_start)
-            wav = x.wav[:, :, start:start + self.length_subwav]
-            if self.compute_mask:
-                self.temp_mask = self._get_mask_wav(x, start)
-            if self.model_name == 'encodec':
-                tokens = self.feat_extractor.encode(wav)[0]  # type: ignore
-            elif self.model_name == 'mert':
-                wav = convert_audio(wav, from_rate=x.sample_rate[0], to_rate=24000, to_channels=1)
-                embeds = self.feat_extractor(wav.squeeze(-2)).last_hidden_state
-        # The embedding layers run outside `no_grad`
-        if self.model_name == 'encodec':
-            tokens = tokens[:, :self.encodec_n_q]
-            embeds = sum([self.embed[k](tokens[:, k]) for k in range(self.encodec_n_q)])  # type: ignore
-        else:
-            embeds = self.embed(embeds)
-        return embeds  # [B, T, dim]
-    def _downsampling_factor(self):
-        """Ratio between the input sample rate and the frame rate of the feature extractor.
-        Encodec uses the extractor's own `frame_rate`; MERT uses a fixed value of 75.
-        Any other model name yields None.
-        """
-        name = self.model_name
-        if name == 'encodec':
-            return self.sample_rate / self.feat_extractor.frame_rate
-        if name == 'mert':
-            return self.sample_rate / 75
-        return None
-    def _get_mask_wav(self, x: WavCondition, start: int) -> tp.Union[torch.Tensor, None]:
-        """Build a boolean mask that is False over the span of the sampled excerpt.
-        Positions are counted in steps of `ds_rate_compression` samples, and the mask has one
-        row per language-model codebook. Returns None when the wav has a single sample.
-        Args:
-            x (WavCondition): Waveform condition the excerpt was taken from.
-            start (int): First sample of the excerpt.
-        """
-        num_samples = x.wav.shape[-1]
-        if num_samples == 1:
-            return None
-        hop = self.ds_rate_compression
-        n_steps = int(num_samples / hop)
-        span = int(self.length_subwav / hop)
-        first = int(start / hop)
-        mask = torch.ones(x.wav.shape[0], self.num_codebooks_lm, n_steps,
-                          device=self.device, dtype=torch.bool)
-        mask[:, :, first:first + span] = 0
-        return mask
-class StyleConditioner(FeatureExtractor):
-    """Conditioner from the paper AUDIO CONDITIONING FOR MUSIC GENERATION VIA
-    DISCRETE BOTTLENECK FEATURES.
-    Given an audio input, it is passed through a Feature Extractor and a
-    transformer encoder. Then it is quantized through RVQ.
-    Args:
-        transformer_scale (str): size of the transformer. See in the __init__ to have more infos.
-        ds_factor (int): the downsampling factor applied to the representation after quantization.
-        encodec_n_q (int): if encodec is used as a feature extractor it sets the number of
-            quantization streams used in it.
-        n_q_out (int): the number of quantization streams used for the RVQ. If increased, there
-            is more information passing as a conditioning.
-        eval_q (int): the number of quantization streams used for the RVQ at evaluation time.
-        q_dropout (bool): if True, at training time, a random number of stream is sampled
-            at each step in the interval [1, n_q_out].
-        bins (int): the codebook size used for each quantization stream.
-        varying_lengths (List[float]): list of the min and max duration in seconds for the
-            randomly subsampled excerpt at training time. For each step a length is sampled
-            in this interval.
-        batch_norm (bool): use of batch normalization after the transformer. Stabilizes the
-            training.
-        rvq_threshold_ema_dead_code (float): threshold for dropping dead codes in the
-            RVQ.
-    """
-    def __init__(self, transformer_scale: str = 'default', ds_factor: int = 15, encodec_n_q: int = 4,
-                 n_q_out: int = 6, eval_q: int = 3, q_dropout: bool = True, bins: int = 1024,
-                 varying_lengths: tp.List[float] = [1.5, 4.5],
-                 batch_norm: bool = True, rvq_threshold_ema_dead_code: float = 0.1,
-                 **kwargs):
-        """Build the transformer encoder, the optional RVQ and the optional batch norm.
-        Raises:
-            ValueError: If `transformer_scale` is not one of 'xsmall', 'default', 'large', 'none'.
-        """
-        scale_presets: tp.Dict[str, tp.Dict[str, tp.Any]] = {
-            'xsmall': {'d_model': 256, 'num_heads': 8, 'num_layers': 4},
-            'large': {'d_model': 1024, 'num_heads': 16, 'num_layers': 24},
-            'default': {'d_model': 512, 'num_heads': 8, 'num_layers': 8},
-            'none': {'d_model': 512},
-        }
-        if transformer_scale not in scale_presets:
-            # Previously this surfaced later as an UnboundLocalError on `tr_args`
-            raise ValueError(
-                f"Unknown transformer_scale '{transformer_scale}', "
-                f"expected one of {sorted(scale_presets)}.")
-        tr_args: tp.Dict[str, tp.Any] = dict(scale_presets[transformer_scale])
-        tr_args.update({
-            'memory_efficient': True, 'activation': 'gelu',
-            'norm_first': True, 'causal': False, 'layer_scale': None,
-            'bias_ff': False, 'bias_attn': False,
-        })
-        dim = tr_args['d_model']
-        super().__init__(dim=dim, encodec_n_q=encodec_n_q, **kwargs)
-        self.ds_factor = ds_factor
-        if transformer_scale == 'none':
-            self.transformer = None
-        else:
-            self.transformer = StreamingTransformer(dim_feedforward=int(4 * dim), **tr_args)
-        self.n_q_out = n_q_out
-        self.eval_q = eval_q
-        self.rvq = None
-        if n_q_out > 0:
-            self.rvq = ResidualVectorQuantizer(dim, n_q=n_q_out, q_dropout=q_dropout, bins=bins,
-                                               threshold_ema_dead_code=rvq_threshold_ema_dead_code)
-        self.autocast = TorchAutocast(enabled=self.device != 'cpu', device_type=self.device, dtype=torch.float32)
-        # Copy the list so instances never share (or mutate) the default argument object
-        self.varying_lengths = list(varying_lengths) if varying_lengths else varying_lengths
-        self.batch_norm = None
-        if batch_norm:
-            self.batch_norm = nn.BatchNorm1d(dim, affine=False)
-        self.mask = None
-    def _get_wav_embedding(self, wav: WavCondition) -> torch.Tensor:
-        """Extract features, encode them, quantize them and subsample the result.
-        During training a new excerpt length is drawn from `varying_lengths` at every call.
-        The mask produced by the feature extractor is moved from `temp_mask` to `mask`
-        when `compute_mask` is set, and `temp_mask` is always cleared.
-        """
-        with self.autocast:
-            if self.varying_lengths and self.training:
-                # Draw the excerpt duration (seconds) for this step
-                assert len(self.varying_lengths) == 2
-                shortest, longest = self.varying_lengths[0], self.varying_lengths[1]
-                length = random.uniform(shortest, longest)
-                self.length_subwav = int(length * self.sample_rate)
-            feats = super()._get_wav_embedding(wav)
-            if self.compute_mask:
-                self.mask = self.temp_mask  # type: ignore
-            self.temp_mask = None
-            hidden = feats if self.transformer is None else self.transformer(feats)
-            if self.batch_norm:
-                hidden = self.batch_norm(hidden.transpose(1, 2)).transpose(1, 2)
-            if self.rvq:
-                # All streams while training, `eval_q` streams otherwise
-                active_q = self.n_q_out if self.training else self.eval_q
-                self.rvq.set_num_codebooks(active_q)
-                q_res = self.rvq(hidden.transpose(1, 2), frame_rate=1.)
-                if self.training:
-                    flashy.distrib.average_tensors(self.rvq.buffers())
-                hidden = q_res.x.transpose(1, 2)
-            # Fixed-rate subsampling along the frame axis
-            hidden = hidden[:, ::self.ds_factor]
-        return hidden
-    def set_params(self, eval_q: int = 3,
-                   excerpt_length: float = 3.0,
-                   ds_factor: tp.Optional[int] = None, encodec_n_q: tp.Optional[int] = None):
-        """Update the settings used when conditioning with this module.
-        Args:
-            eval_q (int): number of codebooks used when evaluating the model
-            excerpt_length (float): the length in seconds of the excerpts used to condition the model
-            ds_factor (int, optional): new downsampling factor; left unchanged when None
-            encodec_n_q (int, optional): new number of Encodec streams; left unchanged when None
-        """
-        self.eval_q = eval_q
-        self.length_subwav = int(excerpt_length * self.sample_rate)
-        overrides = (('ds_factor', ds_factor), ('encodec_n_q', encodec_n_q))
-        for attr_name, new_value in overrides:
-            if new_value is not None:
-                setattr(self, attr_name, new_value)
-    def _downsampling_factor(self):
-        """Parent downsampling factor multiplied by the extra `ds_factor` subsampling."""
-        return super()._downsampling_factor() * self.ds_factor
-    def forward(self, x: WavCondition) -> ConditionType:
-        """Project the style embedding and mask the frames beyond each item's length.
-        Returns:
-            ConditionType: The masked projected embedding and the integer mask.
-        """
-        _, lengths, *_ = x
-        style = self._get_wav_embedding(x)
-        projected = self.output_proj(style.to(self.output_proj.weight))
-        # Lengths are converted from samples to embedding frames
-        frame_lengths = lengths / self._downsampling_factor()
-        mask = length_to_mask(frame_lengths, max_len=projected.shape[1]).int()  # type: ignore
-        masked = projected * mask.unsqueeze(2).to(self.device)
-        return masked, mask
-class JointEmbeddingConditioner(BaseConditioner):
-    """Conditioner working on a joint embedding that can come from audio or from text.
-    Args:
-        dim (int): Dimension.
-        output_dim (int): Output dimension.
-        device (str): Device.
-        attribute (str): Attribute used by the conditioner.
-        autocast_dtype (str): Autocast for the conditioner.
-        quantize (bool): Whether to quantize the CLAP embedding.
-        n_q (int): Number of residual quantizers (used if quantize is true).
-        bins (int): Quantizers' codebooks size (used if quantize is true).
-        kwargs: Additional parameters for residual vector quantizer.
-    """
-    def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
-                 autocast_dtype: tp.Optional[str] = 'float32', quantize: bool = True,
-                 n_q: int = 12, bins: int = 1024, **kwargs):
-        super().__init__(dim=dim, output_dim=output_dim)
-        self.device = device
-        self.attribute = attribute
-        # Autocast is skipped on CPU and when no dtype is requested
-        use_autocast = autocast_dtype is not None and device != 'cpu'
-        if use_autocast:
-            dtype = getattr(torch, autocast_dtype)
-            assert isinstance(dtype, torch.dtype)
-            logger.info(f"JointEmbeddingConditioner will be evaluated with autocast as {autocast_dtype}.")
-            self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
-        else:
-            self.autocast = TorchAutocast(enabled=False)
-            logger.warning("JointEmbeddingConditioner has no autocast, this might lead to NaN.")
-        # Optional residual vector quantizer discretizing the conditioning embedding
-        self.quantizer: tp.Optional[ResidualVectorQuantizer] = None
-        if quantize:
-            self.quantizer = ResidualVectorQuantizer(dim, n_q=n_q, bins=bins, **kwargs)
-    def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        """Compute the latent joint embedding of the inputs; must be overridden.
-        Returns:
-            tuple[torch.Tensor, torch.Tensor]: Tensor for the latent embedding
-                and corresponding empty indexes.
-        """
-        raise NotImplementedError()
-    def forward(self, x: JointEmbedCondition) -> ConditionType:
-        """Embed, optionally quantize, project, and zero the entries flagged as empty."""
-        with self.autocast:
-            embed, empty_idx = self._get_embed(x)
-            if self.quantizer is None:
-                latent = embed
-            else:
-                # The quantizer is called on a [N, dim, 1] view of the embedding
-                q_res = self.quantizer(embed.view(-1, self.dim, 1), frame_rate=1)
-                latent = q_res.x.view(-1, self.dim)
-            out_embed = self.output_proj(latent).view(-1, 1, self.output_dim)
-            mask = torch.ones(*out_embed.shape[:2], device=out_embed.device)
-            mask[empty_idx, :] = 0  # zero-out the items whose input is missing
-            out_embed = out_embed * mask.unsqueeze(-1)
-            return out_embed, mask
-    def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
-        """Return the condition unchanged."""
-        return x
-class CLAPEmbeddingConditioner(JointEmbeddingConditioner):
-    """Joint Embedding conditioner based on pre-trained CLAP model.
-    This CLAP-based conditioner supports a caching mechanism
-    over the computed embeddings for faster training.
-    Args:
-        dim (int): Dimension.
-        output_dim (int): Output dimension.
-        device (str): Device.
-        attribute (str): Attribute used by the conditioner.
-        quantize (bool): Whether to quantize the CLAP embedding.
-        n_q (int): Number of residual quantizers (used if quantize is true).
-        bins (int): Quantizers' codebooks size (used if quantize is true).
-        checkpoint (str): Path to CLAP checkpoint.
-        model_arch (str): CLAP model architecture.
-        enable_fusion (bool): Enable fusion for CLAP model.
-        sample_rate (int): Sample rate used by CLAP model.
-        max_audio_length (float): Maximum audio length for CLAP model.
-        audio_stride (float): Stride to use for getting a CLAP embedding on the full sequence.
-        normalize (bool): Whether to normalize the CLAP embedding.
-        text_p (float): Probability of using text representation instead of audio at train time.
-        batch_size (Optional[int]): Batch size for CLAP embedding computation.
-        autocast_dtype (str): Autocast for the conditioner.
-        cache_path (Optional[str]): Path for pre-computed embeddings caching.
-        kwargs: Additional parameters for residual vector quantizer.
-    """
-    def __init__(self, dim: int, output_dim: int, device: str, attribute: str,
-                 quantize: bool, n_q: int, bins: int, checkpoint: tp.Union[str, Path], model_arch: str,
-                 enable_fusion: bool, sample_rate: int, max_audio_length: float, audio_stride: float,
-                 normalize: bool, text_p: float, batch_size: tp.Optional[int] = None,
-                 autocast_dtype: tp.Optional[str] = 'float32', cache_path: tp.Optional[str] = None, **kwargs):
-        """Load the CLAP model and its tokenizer, then set up the optional embedding caches.
-        `max_audio_length` and `audio_stride` are given in seconds and stored as sample counts
-        at the CLAP sample rate. `text_p` is compared against a uniform random draw.
-        Raises:
-            ImportError: If the `laion_clap` package is not installed.
-        """
-        try:
-            import laion_clap  # type: ignore
-        except ImportError as err:
-            # Chain the original error so the real import failure stays visible
-            raise ImportError(
-                "Please install CLAP to use the CLAPEmbeddingConditioner: 'pip install laion_clap'") from err
-        warnings.warn("Sample rate for CLAP conditioner was fixed in version v1.1.0, (from 44.1 to 48 kHz). "
-                      "Please retrain all models.")
-        checkpoint = AudioCraftEnvironment.resolve_reference_path(checkpoint)
-        clap_tokenize = RobertaTokenizer.from_pretrained('roberta-base')
-        clap_model = laion_clap.CLAP_Module(enable_fusion=enable_fusion, amodel=model_arch)
-        load_clap_state_dict(clap_model, checkpoint)
-        clap_model.eval()
-        clap_model.to(device)
-        super().__init__(dim=dim, output_dim=output_dim, device=device, attribute=attribute,
-                         autocast_dtype=autocast_dtype, quantize=quantize, n_q=n_q, bins=bins,
-                         **kwargs)
-        self.checkpoint = checkpoint
-        self.enable_fusion = enable_fusion
-        self.model_arch = model_arch
-        self.clap: laion_clap.CLAP_Module
-        self.clap_tokenize: RobertaTokenizer
-        self.clap_sample_rate = sample_rate
-        self.clap_max_frames = int(self.clap_sample_rate * max_audio_length)
-        self.clap_stride = int(self.clap_sample_rate * audio_stride)
-        self.batch_size = batch_size or 1
-        self.normalize = normalize
-        self.text_p = text_p
-        # Written straight into `__dict__` rather than through attribute assignment.
-        # NOTE(review): presumably so the CLAP model is not registered as a submodule; confirm.
-        self.__dict__['clap_tokenize'] = clap_tokenize
-        self.__dict__['clap'] = clap_model
-        self.wav_cache, self.text_cache = None, None
-        if cache_path is not None:
-            self.wav_cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
-                                            compute_embed_fn=self._get_wav_embedding_for_cache,
-                                            extract_embed_fn=self._extract_wav_embedding_chunk)
-            self.text_cache = EmbeddingCache(Path(cache_path) / 'text', self.device,
-                                             compute_embed_fn=self._get_text_embedding_for_cache)
-    def _tokenizer(self, texts: tp.Union[str, tp.List[str]]) -> dict:
-        """Tokenize `texts`, padded and truncated to 77 tokens, returned as PyTorch tensors."""
-        # same defaults as the CLAP module
-        options = dict(padding="max_length", truncation=True, max_length=77, return_tensors="pt")
-        return self.clap_tokenize(texts, **options)
-    def _compute_text_embedding(self, text: tp.List[str]) -> torch.Tensor:
-        """Embed a batch of text descriptions with the CLAP model, without gradients.
-        Args:
-            text (list[str]): List of text for the batch, with B items.
-        Returns:
-            torch.Tensor: CLAP embedding derived from text, of shape [B, 1, D], with D the CLAP embedding dimension.
-        """
-        with torch.no_grad():
-            embed = self.clap.get_text_embedding(text, tokenizer=self._tokenizer, use_tensor=True)
-            batch_size, embed_dim = embed.size(0), embed.size(-1)
-            # Insert a singleton axis between batch and feature dimensions
-            return embed.view(batch_size, 1, embed_dim)
-    def _get_text_embedding_for_cache(self, path: tp.Union[Path, str],
-                                      x: JointEmbedCondition, idx: int) -> torch.Tensor:
-        """Cache callback: embed the text of batch item `idx`, using "" when it is None."""
-        entry = x.text[idx]
-        description = "" if entry is None else entry
-        return self._compute_text_embedding([description])[0]
-    def _preprocess_wav(self, wav: torch.Tensor, length: torch.Tensor, sample_rates: tp.List[int]) -> torch.Tensor:
-        """Bring a batch of audio to the format fed to the CLAP model.
-        Each item is converted to the CLAP sample rate and a single channel when sample rates
-        are given, then the channel axis is averaged away. `length` is not used here.
-        Args:
-            wav (torch.Tensor): Audio wav, of shape [B, C, T].
-            length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
-            sample_rates (list[int]): Sample rates for each sample in the batch
-        Returns:
-            torch.Tensor: Audio wav of shape [B, T].
-        """
-        assert wav.dim() == 3, "Expecting wav to be [B, C, T]"
-        if sample_rates is not None:
-            converted = [
-                convert_audio(audio, from_rate=sample_rates[i], to_rate=self.clap_sample_rate, to_channels=1)
-                for i, audio in enumerate(wav)
-            ]
-            wav = torch.stack(converted, dim=0)
-        return wav.mean(dim=1)
-    def _compute_wav_embedding(self, wav: torch.Tensor, length: torch.Tensor,
-                               sample_rates: tp.List[int], reduce_mean: bool = False) -> torch.Tensor:
-        """Compute the CLAP audio embedding of a batch of waveforms.
-        CLAP works on fixed-size inputs, so audio of at least `clap_max_frames` samples is cut
-        into windows of `clap_max_frames` samples taken every `clap_stride` samples. Shorter
-        audio is embedded as a single window. Windows are embedded `batch_size` at a time.
-        Args:
-            wav (torch.Tensor): Audio wav, of shape [B, C, T].
-            length (torch.Tensor): Actual length of the audio for each item in the batch, of shape [B].
-            sample_rates (list[int]): Sample rates for each sample in the batch.
-            reduce_mean (bool): Whether to get the average tensor.
-        Returns:
-            torch.Tensor: Audio embedding of shape [B, F, D], F being the number of chunks, D the dimension.
-        """
-        with torch.no_grad():
-            mono = self._preprocess_wav(wav, length, sample_rates)
-            B, T = mono.shape
-            if T < self.clap_max_frames:
-                windows = mono.view(-1, 1, T)  # [B, F, T] with F=1
-            else:
-                windows = mono.unfold(-1, self.clap_max_frames, self.clap_stride)  # [B, F, T]
-            flat = einops.rearrange(windows, 'b f t -> (b f) t')
-            step = self.batch_size
-            partial_embeds = [
-                self.clap.get_audio_embedding_from_data(flat[i:i + step, ...], use_tensor=True)
-                for i in range(0, flat.size(0), step)
-            ]
-            embed = torch.cat(partial_embeds, dim=0)
-            embed = einops.rearrange(embed, '(b f) d -> b f d', b=B)
-            if not reduce_mean:
-                return embed  # [B, F, D]
-            return embed.mean(dim=1, keepdim=True)  # [B, 1, D]
-    def _get_wav_embedding_for_cache(self, path: tp.Union[str, Path],
-                                     x: JointEmbedCondition, idx: int) -> torch.Tensor:
-        """Cache callback: embed the whole audio file found at `path`.
-        The audio is read from disk and embedded as a batch of one without averaging
-        over chunks. `x` and `idx` are not used.
-        Args:
-            path (str or Path): Path to the full audio file.
-        Returns:
-            torch.Tensor: Single-item tensor of shape [F, D], F being the number of chunks, D the dimension.
-        """
-        audio, native_sr = audio_read(path)  # [C, T]
-        batch = audio.unsqueeze(0).to(self.device)  # [1, C, T]
-        num_samples = torch.LongTensor([batch.shape[-1]]).to(self.device)
-        full_embed = self._compute_wav_embedding(batch, num_samples, [native_sr], reduce_mean=False)  # [1, F, D]
-        return full_embed.squeeze(0)  # [F, D]
-    def _extract_wav_embedding_chunk(self, full_embed: torch.Tensor, x: JointEmbedCondition, idx: int) -> torch.Tensor:
-        """Extract the chunk of embedding matching the seek_time and length from the full CLAP audio embedding.
-        Row `k` of `full_embed` is taken to start `k * clap_stride / clap_sample_rate` seconds into
-        the file. The rows whose index falls between the seek time and the seek time plus one CLAP
-        window are averaged. The stride is kept as a float number of seconds, so fractional and
-        sub-second strides are handled, and at least one row is always averaged.
-        Args:
-            full_embed (torch.Tensor): CLAP embedding computed on the full wave, of shape [F, D].
-            x (JointEmbedCondition): Joint embedding condition for the full batch.
-            idx (int): Index considered for the given embedding to extract.
-        Returns:
-            torch.Tensor: Wav embedding averaged on sliding window, of shape [1, D].
-        Raises:
-            ValueError: If the configured CLAP stride is not strictly positive.
-        """
-        seek_time = x.seek_time[idx]
-        seek_time = 0. if seek_time is None else seek_time
-        stride_seconds = self.clap_stride / self.clap_sample_rate
-        if stride_seconds <= 0:
-            raise ValueError(f"CLAP stride must be positive, got {self.clap_stride} samples.")
-        window_seconds = self.clap_max_frames / self.clap_sample_rate
-        start_offset = int(seek_time // stride_seconds)
-        end_offset = int((seek_time + window_seconds) // stride_seconds)
-        # Keep the window inside the available rows and never empty: averaging zero rows gives NaN
-        last_row = max(full_embed.shape[0] - 1, 0)
-        start_offset = min(start_offset, last_row)
-        end_offset = max(end_offset, start_offset + 1)
-        wav_embed = full_embed[start_offset:end_offset, ...]
-        wav_embed = wav_embed.mean(dim=0, keepdim=True)
-        return wav_embed.to(self.device)  # [1, D]
-    def _get_text_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
-        """Return the CLAP text embedding of the batch, from the cache when possible.
-        The cache is bypassed when the condition wav is nullified (condition dropout).
-        The embedding is L2-normalized when `normalize` is set.
-        """
-        cond_is_set = x.wav.shape[-1] > 1
-        if self.text_cache is None or not cond_is_set:
-            descriptions = ["" if t is None else t for t in x.text]
-            embed = self._compute_text_embedding(descriptions)
-        else:
-            assert all(p is not None for p in x.path), "Cache requires all JointEmbedCondition paths to be provided"
-            cache_paths = [Path(p) for p in x.path if p is not None]
-            embed = self.text_cache.get_embed_from_cache(cache_paths, x)
-        if not self.normalize:
-            return embed
-        return torch.nn.functional.normalize(embed, p=2.0, dim=-1)
-    def _get_wav_embedding(self, x: JointEmbedCondition) -> torch.Tensor:
-        """Return the CLAP audio embedding of the batch, from the cache when possible.
-        The cache is used only if it exists, every path is defined and the condition wav is
-        not nullified (condition dropout). Otherwise the wav is embedded and averaged over chunks.
-        The embedding is L2-normalized when `normalize` is set.
-        """
-        all_paths_defined = all(p is not None for p in x.path)
-        cond_is_set = x.wav.shape[-1] > 1
-        cache_usable = self.wav_cache is not None and all_paths_defined and cond_is_set
-        if cache_usable:
-            cache_paths = [Path(p) for p in x.path if p is not None]
-            embed = self.wav_cache.get_embed_from_cache(cache_paths, x)
-        else:
-            embed = self._compute_wav_embedding(x.wav, x.length, x.sample_rate, reduce_mean=True)
-        if not self.normalize:
-            return embed
-        return torch.nn.functional.normalize(embed, p=2.0, dim=-1)
-    def tokenize(self, x: JointEmbedCondition) -> JointEmbedCondition:
-        """Populate the wav and text embedding caches, then return the condition unchanged.
-        Nothing is populated unless every path of the batch is defined. The wav cache is
-        populated before the text cache.
-        """
-        # Trying to limit as much as possible sync points when the cache is warm.
-        if not all(p is not None for p in x.path):
-            return x
-        # All paths are known to be defined here, so the former per-cache asserts could never fire
-        paths = [Path(p) for p in x.path if p is not None]
-        for cache in (self.wav_cache, self.text_cache):
-            if cache is not None:
-                cache.populate_embed_cache(paths, x)
-        return x
-    def _get_embed(self, x: JointEmbedCondition) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        """Get the shared CLAP latent from the audio or from the text of the batch.
-        In training the text is used with probability `text_p` and the audio otherwise.
-        Outside training the text is always used. The random draw happens on every call.
-        Returns:
-            tuple[torch.Tensor, torch.Tensor]: The embedding and the indexes of the empty items.
-        """
-        use_text_embed = random.random() < self.text_p
-        use_audio = self.training and not use_text_embed
-        if use_audio:
-            # we assume we always have the audio wav, hence no empty index
-            return self._get_wav_embedding(x), torch.LongTensor([])
-        embed = self._get_text_embedding(x)
-        missing = [i for i, description in enumerate(x.text) if description is None or description == ""]
-        return embed, torch.LongTensor(missing)
-def dropout_symbolic_conditions(sample: ConditioningAttributes,
-                                condition: str, null_chord_idx: int = 194) -> ConditioningAttributes:
-    """
-    Drop one symbolic condition of the sample by replacing it with its nullified version.
-    The sample is returned untouched when it has no symbolic attribute or when its chords
-    condition spans at most one frame. The sample is modified in place.
-    Args:
-        sample (ConditioningAttributes): The sample containing symbolic attributes to potentially dropout.
-        condition (str): The specific condition within the symbolic attributes to apply dropout.
-        null_chord_idx (int, optional): The index used to represent a null chord. Defaults to 194.
-    Returns:
-        ConditioningAttributes: The modified sample with dropout applied to the specified condition.
-    Raises:
-        ValueError: If the specified condition is not present in the sample's symbolic attributes.
-    """
-    symbolic = sample.symbolic
-    if symbolic == {} or symbolic[JascoCondConst.CRD.value].frame_chords.shape[-1] <= 1:  # type: ignore
-        # nothing to drop
-        return sample
-    if condition not in symbolic:
-        raise ValueError(
-            "dropout_symbolic_condition received an unexpected condition!"
-            f" expected {symbolic.keys()}"
-            f" but got '{condition}'!"
-        )
-    current = symbolic[condition]
-    if condition == JascoCondConst.CRD.value:
-        symbolic[condition] = nullify_chords(current, null_chord_idx=null_chord_idx)
-    elif condition == JascoCondConst.MLD.value:
-        symbolic[condition] = nullify_melody(current)
-    return sample
-def dropout_condition(sample: ConditioningAttributes,
-                      condition_type: str, condition: str,
-                      **kwargs) -> ConditioningAttributes:
-    """Utility function for nullifying an attribute inside a ConditioningAttributes object.
-    "wav" conditions are replaced using `nullify_wav`, "joint_embed" conditions using
-    `nullify_joint_embed`, "symbolic" conditions are delegated to `dropout_symbolic_conditions`
-    (which receives `kwargs`), and "text" conditions are set to None.
-    Works in-place.
-    Raises:
-        ValueError: If `condition_type` is not supported or `condition` is missing for that type.
-    """
-    if condition_type not in ['text', 'wav', 'joint_embed', 'symbolic']:
-        raise ValueError(
-            "dropout_condition got an unexpected condition type!"
-            f" expected 'text', 'wav', 'joint_embed' or 'symbolic' but got '{condition_type}'"
-        )
-    available = getattr(sample, condition_type)
-    if condition not in available:
-        # Report the keys of the type that was actually looked up
-        raise ValueError(
-            "dropout_condition received an unexpected condition!"
-            f" expected {condition_type}={available.keys()}"
-            f" but got '{condition}' of type '{condition_type}'!"
-        )
-    if condition_type == 'wav':
-        wav_cond = sample.wav[condition]
-        sample.wav[condition] = nullify_wav(wav_cond)
-    elif condition_type == 'joint_embed':
-        embed = sample.joint_embed[condition]
-        sample.joint_embed[condition] = nullify_joint_embed(embed)
-    elif condition_type == 'symbolic':
-        sample = dropout_symbolic_conditions(sample=sample, condition=condition, **kwargs)
-    else:
-        sample.text[condition] = None
-    return sample
-class DropoutModule(nn.Module):
-    """Common parent of the dropout modules; owns a dedicated, seeded random generator."""
-    def __init__(self, seed: int = 1234):
-        super().__init__()
-        # `manual_seed` returns the generator itself, so creation and seeding are chained.
-        self.rng = torch.Generator().manual_seed(seed)
-class AttributeDropout(DropoutModule):
-    """Dropout applied to each attribute independently, with its own probability.
-    Unlike ClassifierFreeGuidanceDropout, attributes are dropped separately: "artist" can be
-    dropped while "genre" is kept, whereas ClassifierFreeGuidanceDropout drops everything together.
-    Args:
-        p (tp.Dict[str, tp.Dict[str, float]]): For each condition type, a dict mapping an
-            attribute name to its dropout probability. For example:
-            {
-                "text": {"genre": 0.1, "artist": 0.5},
-                "wav": {"wav": 0.25},
-            }
-            Attributes absent from the inner dict get probability 0.
-        active_on_eval (bool, optional): Whether the dropout is active at eval. Default to False.
-        seed (int, optional): Random seed.
-    """
-    def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
-        super().__init__(seed=seed)
-        self.active_on_eval = active_on_eval
-        # per condition type, a lookup that yields the configured probability or 0 when unset
-        self.p = {
-            condition_type: defaultdict(lambda: 0, probs)
-            for condition_type, probs in p.items()
-        }
-    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
-        """Drop attributes at random from a deep copy of the batch.
-        One random draw is made per configured attribute; when it falls below that attribute's
-        probability, the attribute is dropped for every sample of the batch.
-        Args:
-            samples (list[ConditioningAttributes]): List of conditions.
-        Returns:
-            list[ConditioningAttributes]: List of conditions after certain attributes were set to None.
-                The input list itself is returned when the module is inactive.
-        """
-        if not (self.training or self.active_on_eval):
-            return samples
-        dropped = deepcopy(samples)
-        for condition_type, probs in self.p.items():  # condition types, e.g. text, wav, symbolic
-            for condition, prob in probs.items():  # attributes of that type, e.g. artist, genre
-                draw = torch.rand(1, generator=self.rng).item()
-                if draw < prob:
-                    for sample in dropped:
-                        dropout_condition(sample, condition_type, condition)
-        return dropped
-    def __repr__(self):
-        return f"AttributeDropout({dict(self.p)})"
-class ClassifierFreeGuidanceDropout(DropoutModule):
-    """Classifier Free Guidance dropout.
-    All attributes are dropped with the same probability.
-    Args:
-        p (float): Probability to apply condition dropout during training.
-        seed (int): Random seed.
-    """
-    def __init__(self, p: float, seed: int = 1234):
-        super().__init__(seed=seed)
-        self.p = p
-    def forward(self, samples: tp.List[ConditioningAttributes],
-                cond_types: tp.Optional[tp.List[str]] = None,
-                **kwargs) -> tp.List[ConditioningAttributes]:
-        """Drop every attribute of the selected condition types, for the whole batch at once.
-        Args:
-            samples (list[ConditioningAttributes]): List of conditions.
-            cond_types (list[str], optional): Condition types to nullify.
-                Defaults to ["wav", "text"] when omitted or None.
-            **kwargs: Forwarded to `dropout_condition`.
-        Returns:
-            list[ConditioningAttributes]: List of conditions after all attributes were set to None.
-                The input list itself is returned when not training or when no drop is drawn.
-        """
-        if not self.training:
-            return samples
-        # decide on which attributes to drop in a batched fashion
-        drop = torch.rand(1, generator=self.rng).item() < self.p
-        if not drop:
-            return samples
-        # A fresh list per call avoids sharing one mutable default between invocations.
-        if cond_types is None:
-            cond_types = ["wav", "text"]
-        # nullify conditions of all attributes
-        samples = deepcopy(samples)
-        for condition_type in cond_types:
-            for sample in samples:
-                for condition in sample.attributes[condition_type]:
-                    dropout_condition(sample, condition_type, condition,
-                                      **kwargs)
-        return samples
-    def __repr__(self):
-        return f"ClassifierFreeGuidanceDropout(p={self.p})"
-class ConditioningProvider(nn.Module):
-    """Prepare and provide conditions given all the supported conditioners.
-    Args:
-        conditioners (dict): Dictionary of conditioners.
-        device (torch.device or str, optional): Device for conditioners and output condition types.
-    """
-    def __init__(self, conditioners: tp.Dict[str, BaseConditioner], device: tp.Union[torch.device, str] = "cpu"):
-        super().__init__()
-        self.device = device
-        self.conditioners = nn.ModuleDict(conditioners)
-    @property
-    def joint_embed_conditions(self):
-        """Attribute names served by the JointEmbeddingConditioner instances."""
-        return [m.attribute for m in self.conditioners.values() if isinstance(m, JointEmbeddingConditioner)]
-    @property
-    def has_joint_embed_conditions(self):
-        """Whether at least one joint-embedding conditioner is configured."""
-        return len(self.joint_embed_conditions) > 0
-    @property
-    def text_conditions(self):
-        """Names of the conditioners that are TextConditioner instances."""
-        return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
-    @property
-    def wav_conditions(self):
-        """Names of the conditioners that are WaveformConditioner instances."""
-        return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
-    @property
-    def has_wav_condition(self):
-        """Whether at least one waveform conditioner is configured."""
-        return len(self.wav_conditions) > 0
-    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
-        """Match attributes/wavs with existing conditioners in self, and tokenize them accordingly.
-        This should be called before starting any real GPU work to avoid synchronization points.
-        This will return a dict matching conditioner names to their arbitrary tokenized representations.
-        Args:
-            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
-                text and wav conditions.
-        Returns:
-            dict[str, Any]: For each attribute, the output of its conditioner's `tokenize`.
-        """
-        # The message parts are concatenated into one string; a stray comma between them
-        # would turn the assertion message into a tuple.
-        assert all(isinstance(x, ConditioningAttributes) for x in inputs), (
-            "Got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]"
-            f" but types were {set(type(x) for x in inputs)}"
-        )
-        output = {}
-        text = self._collate_text(inputs)
-        wavs = self._collate_wavs(inputs)
-        joint_embeds = self._collate_joint_embeds(inputs)
-        assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), (
-            f"Got an unexpected attribute! Expected {self.conditioners.keys()}, "
-            f"got {text.keys(), wavs.keys(), joint_embeds.keys()}"
-        )
-        for attribute, batch in chain(text.items(), wavs.items(), joint_embeds.items()):
-            output[attribute] = self.conditioners[attribute].tokenize(batch)
-        return output
-    def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
-        """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations.
-        The output is for example:
-        {
-            "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
-            "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
-            ...
-        }
-        Args:
-            tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
-        """
-        output = {}
-        for attribute, inputs in tokenized.items():
-            condition, mask = self.conditioners[attribute](inputs)
-            output[attribute] = (condition, mask)
-        return output
-    def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
-        """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
-        are the attributes and the values are the aggregated input per attribute.
-        For example:
-        Input:
-        [
-            ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
-            ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
-        ]
-        Output:
-        {
-            "genre": ["Rock", "Hip-hop"],
-            "description": ["A rock song with a guitar solo", "A hip-hop verse"]
-        }
-        Args:
-            samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
-        Returns:
-            dict[str, list[str, optional]]: A dictionary mapping an attribute name to text batch.
-        """
-        out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
-        texts = [x.text for x in samples]
-        for text in texts:
-            for condition in self.text_conditions:
-                out[condition].append(text[condition])
-        return out
-    def _collate_wavs(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, WavCondition]:
-        """Generate a dict where the keys are attributes by which we fetch similar wavs,
-        and the values are Tensors of wavs according to said attributes.
-        *Note*: by the time the samples reach this function, each sample should have some waveform
-        inside the "wav" attribute. It should be either:
-        1. A real waveform
-        2. A null waveform due to the sample having no similar waveforms (nullified by the dataset)
-        3. A null waveform due to it being dropped in a dropout module (nullified by dropout)
-        Args:
-            samples (list of ConditioningAttributes): List of ConditioningAttributes samples.
-        Returns:
-            dict[str, WavCondition]: A dictionary mapping an attribute name to wavs.
-        """
-        wavs = defaultdict(list)
-        lengths = defaultdict(list)
-        sample_rates = defaultdict(list)
-        paths = defaultdict(list)
-        seek_times = defaultdict(list)
-        out: tp.Dict[str, WavCondition] = {}
-        for sample in samples:
-            for attribute in self.wav_conditions:
-                wav, length, sample_rate, path, seek_time = sample.wav[attribute]
-                assert wav.dim() == 3, f"Got wav with dim={wav.dim()}, but expected 3 [1, C, T]"
-                assert wav.size(0) == 1, f"Got wav [B, C, T] with shape={wav.shape}, but expected B == 1"
-                # mono-channel conditioning
-                wav = wav.mean(1, keepdim=True)  # [1, 1, T]
-                wavs[attribute].append(wav.flatten())  # [T]
-                lengths[attribute].append(length)
-                sample_rates[attribute].extend(sample_rate)
-                paths[attribute].extend(path)
-                seek_times[attribute].extend(seek_time)
-        # stack all wavs to a single tensor
-        for attribute in self.wav_conditions:
-            stacked_wav, _ = collate(wavs[attribute], dim=0)
-            out[attribute] = WavCondition(
-                stacked_wav.unsqueeze(1), torch.cat(lengths[attribute]), sample_rates[attribute],
-                paths[attribute], seek_times[attribute])
-        return out
-    def _collate_joint_embeds(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, JointEmbedCondition]:
-        """Generate a dict where the keys are attributes by which we compute joint embeddings,
-        and the values are Tensors of pre-computed embeddings and the corresponding text attributes.
-        Args:
-            samples (list[ConditioningAttributes]): List of ConditioningAttributes samples.
-        Returns:
-            A dictionary mapping an attribute name to joint embeddings.
-        """
-        texts = defaultdict(list)
-        wavs = defaultdict(list)
-        lengths = defaultdict(list)
-        sample_rates = defaultdict(list)
-        paths = defaultdict(list)
-        seek_times = defaultdict(list)
-        # Channel count of the first wav seen; every later wav must match it.
-        channels: int = 0
-        out = {}
-        for sample in samples:
-            for attribute in self.joint_embed_conditions:
-                wav, text, length, sample_rate, path, seek_time = sample.joint_embed[attribute]
-                assert wav.dim() == 3
-                if channels == 0:
-                    channels = wav.size(1)
-                else:
-                    assert channels == wav.size(1), "not all audio has same number of channels in batch"
-                assert wav.size(0) == 1, "Expecting single-wav batch in the collate method"
-                wav = einops.rearrange(wav, "b c t -> (b c t)")  # [1, C, T] => [C * T]
-                wavs[attribute].append(wav)
-                texts[attribute].extend(text)
-                lengths[attribute].append(length)
-                sample_rates[attribute].extend(sample_rate)
-                paths[attribute].extend(path)
-                seek_times[attribute].extend(seek_time)
-        for attribute in self.joint_embed_conditions:
-            stacked_texts = texts[attribute]
-            stacked_paths = paths[attribute]
-            stacked_seek_times = seek_times[attribute]
-            stacked_wavs = pad_sequence(wavs[attribute]).to(self.device)
-            stacked_wavs = einops.rearrange(stacked_wavs, "(c t) b -> b c t", c=channels)
-            stacked_sample_rates = sample_rates[attribute]
-            stacked_lengths = torch.cat(lengths[attribute]).to(self.device)
-            assert stacked_lengths.size(0) == stacked_wavs.size(0)
-            assert len(stacked_sample_rates) == stacked_wavs.size(0)
-            assert len(stacked_texts) == stacked_wavs.size(0)
-            out[attribute] = JointEmbedCondition(
-                text=stacked_texts, wav=stacked_wavs,
-                length=stacked_lengths, sample_rate=stacked_sample_rates,
-                path=stacked_paths, seek_time=stacked_seek_times)
-        return out
-class ConditionFuser(StreamingModule):
-    """Combines the individual conditions with the actual model input.
-    Args:
-        fuse2cond (tp.Dict[str, tp.List[str]]): For each fusing method, the conditions
-            fused that way. For example:
-            {
-                "prepend": ["description"],
-                "sum": ["genre", "bpm"],
-                "cross": ["description"],
-            }
-        cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
-        cross_attention_pos_emb_scale (float): Scale for positional embeddings in cross attention if used.
-    """
-    FUSING_METHODS = ["sum", "prepend", "cross", "ignore", "input_interpolate"]
-    def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
-                 cross_attention_pos_emb_scale: float = 1.0):
-        super().__init__()
-        assert all(
-            method in self.FUSING_METHODS for method in fuse2cond
-        ), f"Got invalid fuse method, allowed methods: {self.FUSING_METHODS}"
-        self.cross_attention_pos_emb = cross_attention_pos_emb
-        self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
-        self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
-        # Reverse lookup: condition name -> fusing method (a later method overrides an earlier one).
-        self.cond2fuse: tp.Dict[str, str] = {
-            condition: fuse_method
-            for fuse_method, conditions in fuse2cond.items()
-            for condition in conditions
-        }
-    def forward(
-        self,
-        input: torch.Tensor,
-        conditions: tp.Dict[str, ConditionType]
-    ) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
-        """Fuse the conditions to the provided model input.
-        Conditions fused with 'sum' or 'input_interpolate' are added to `input` in place.
-        'prepend' conditions are only concatenated when no 'offsets' entry exists yet in the
-        streaming state, i.e. on the first step.
-        Args:
-            input (torch.Tensor): Transformer input.
-            conditions (dict[str, ConditionType]): Dict of conditions.
-        Returns:
-            tuple[torch.Tensor, torch.Tensor]: The first tensor is the transformer input
-                after the conditions have been fused. The second output tensor is the tensor
-                used for cross-attention or None if no cross attention inputs exist.
-        """
-        _, T, _ = input.shape
-        first_step = 'offsets' not in self._streaming_state
-        if first_step:
-            offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
-        else:
-            offsets = self._streaming_state['offsets']
-        assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
-            f"given conditions contain unknown attributes for fuser, " \
-            f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
-        cross_attention_output = None
-        for cond_type, (cond, _) in conditions.items():
-            op = self.cond2fuse[cond_type]
-            if op == 'ignore':
-                continue
-            if op == 'sum':
-                input += cond
-            elif op == 'input_interpolate':
-                # resample the condition along time so it matches the input length
-                cond = einops.rearrange(cond, "b t d -> b d t")
-                cond = F.interpolate(cond, size=input.shape[1])
-                input += einops.rearrange(cond, "b d t -> b t d")
-            elif op == 'prepend':
-                if first_step:
-                    input = torch.cat([cond, input], dim=1)
-            elif op == 'cross':
-                if cross_attention_output is None:
-                    cross_attention_output = cond
-                else:
-                    cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
-            else:
-                raise ValueError(f"unknown op ({op})")
-        if self.cross_attention_pos_emb and cross_attention_output is not None:
-            n_positions = cross_attention_output.shape[1]
-            positions = torch.arange(n_positions, device=cross_attention_output.device).view(1, -1, 1)
-            pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
-            cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
-        if self._is_streaming:
-            self._streaming_state['offsets'] = offsets + T
-        return input, cross_attention_output

audiocraft/modules/conv.py DELETED Viewed

@@ -1,245 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import math
-import typing as tp
-import warnings
-import torch
-from torch import nn
-from torch.nn import functional as F
-from torch.nn.utils.parametrizations import spectral_norm, weight_norm
-# Normalization names accepted by the convolution wrappers of this module.
-CONV_NORMALIZATIONS = frozenset((
-    'none', 'weight_norm', 'spectral_norm', 'time_group_norm',
-))
-def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
-    """Return `module` wrapped with weight or spectral norm when `norm` asks for it.
-    For any other accepted value of `norm` the module is returned as is.
-    """
-    assert norm in CONV_NORMALIZATIONS
-    reparametrize = {'weight_norm': weight_norm, 'spectral_norm': spectral_norm}.get(norm)
-    if reparametrize is None:
-        # Membership in CONV_NORMALIZATIONS was already checked, so the remaining
-        # choices need no reparametrization.
-        return module
-    return reparametrize(module)
-def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs):
-    """Build the normalization layer to apply after `module`.
-    Only 'time_group_norm' yields a real layer (a single-group GroupNorm over the module's
-    output channels); every other accepted value gives an identity. Asking for
-    'time_group_norm' together with `causal=True` raises a ValueError.
-    """
-    assert norm in CONV_NORMALIZATIONS
-    if norm != 'time_group_norm':
-        return nn.Identity()
-    if causal:
-        raise ValueError("GroupNorm doesn't support causal evaluation.")
-    assert isinstance(module, nn.modules.conv._ConvNd)
-    return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
-def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
-                                 padding_total: int = 0) -> int:
-    """Return how many extra samples must be appended to the last dimension of `x`
-    so that the final convolution window is full. See `pad_for_conv1d`.
-    """
-    length = x.shape[-1]
-    # Possibly fractional number of frames the convolution would produce.
-    n_frames = (length - kernel_size + padding_total) / stride + 1
-    last_frame_start = (math.ceil(n_frames) - 1) * stride
-    ideal_length = last_frame_start + (kernel_size - padding_total)
-    return ideal_length - length
-def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
-    """Right-pad `x` with zeros so that the last convolution window is full.
-    The extra padding goes at the end. It is needed to rebuild an output of the same
-    length, since otherwise, even with padding, some time steps might get removed.
-    For instance, with total padding = 4, kernel size = 4, stride = 2:
-        0 0 1 2 3 4 5 0 0   # (0s are padding)
-        1   2   3           # (output frames of a convolution, last 0 is never used)
-        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
-            1 2 3 4         # once you removed padding, we are missing one time step !
-    """
-    right_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
-    return F.pad(x, (0, right_padding))
-def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
-    """Thin wrapper around F.pad that also allows reflect padding on inputs shorter than the pad.
-    In that case the input is first zero-padded on the right until reflection is possible,
-    and the same number of trailing samples is cut from the result.
-    """
-    padding_left, padding_right = paddings
-    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
-    if mode != 'reflect':
-        return F.pad(x, paddings, mode, value)
-    length = x.shape[-1]
-    largest_pad = max(padding_left, padding_right)
-    extra_pad = 0
-    if length <= largest_pad:
-        # too short to reflect: grow the input first
-        extra_pad = largest_pad - length + 1
-        x = F.pad(x, (0, extra_pad))
-    padded = F.pad(x, paddings, mode, value)
-    return padded[..., :padded.shape[-1] - extra_pad]
-def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
-    """Strip `paddings` (left, right) from the last dimension of x; zero padding is handled
-    properly. Only for 1d!
-    """
-    left, right = paddings
-    assert left >= 0 and right >= 0, (left, right)
-    total_length = x.shape[-1]
-    assert (left + right) <= total_length
-    # An explicit end index keeps right == 0 from yielding an empty slice.
-    return x[..., left: total_length - right]
-class NormConv1d(nn.Module):
-    """Conv1d bundled with its normalization, giving every normalization
-    approach the same interface.
-    """
-    def __init__(self, *args, causal: bool = False, norm: str = 'none',
-                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
-        super().__init__()
-        conv = nn.Conv1d(*args, **kwargs)
-        self.conv = apply_parametrization_norm(conv, norm)
-        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
-        self.norm_type = norm
-    def forward(self, x):
-        # convolution first, then the normalization layer
-        return self.norm(self.conv(x))
-class NormConv2d(nn.Module):
-    """Conv2d bundled with its normalization, giving every normalization
-    approach the same interface. The normalization is always built as non-causal.
-    """
-    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
-        super().__init__()
-        conv = nn.Conv2d(*args, **kwargs)
-        self.conv = apply_parametrization_norm(conv, norm)
-        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
-        self.norm_type = norm
-    def forward(self, x):
-        # convolution first, then the normalization layer
-        return self.norm(self.conv(x))
-class NormConvTranspose1d(nn.Module):
-    """ConvTranspose1d bundled with its normalization, giving every normalization
-    approach the same interface.
-    """
-    def __init__(self, *args, causal: bool = False, norm: str = 'none',
-                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
-        super().__init__()
-        convtr = nn.ConvTranspose1d(*args, **kwargs)
-        self.convtr = apply_parametrization_norm(convtr, norm)
-        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
-        self.norm_type = norm
-    def forward(self, x):
-        # transposed convolution first, then the normalization layer
-        return self.norm(self.convtr(x))
-class NormConvTranspose2d(nn.Module):
-    """Wrapper around ConvTranspose2d and normalization applied to this conv
-    to provide a uniform interface across normalization approaches.
-    The normalization is always built as non-causal.
-    """
-    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
-        super().__init__()
-        self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
-        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
-        # Exposed like on NormConv1d, NormConv2d and NormConvTranspose1d, so code that
-        # inspects `norm_type` works on all four wrappers.
-        self.norm_type = norm
-    def forward(self, x):
-        x = self.convtr(x)
-        x = self.norm(x)
-        return x
-class StreamableConv1d(nn.Module):
-    """Conv1d that takes care of asymmetric or causal padding and of
-    normalization by itself.
-    """
-    def __init__(self, in_channels: int, out_channels: int,
-                 kernel_size: int, stride: int = 1, dilation: int = 1,
-                 groups: int = 1, bias: bool = True, causal: bool = False,
-                 norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {},
-                 pad_mode: str = 'reflect'):
-        super().__init__()
-        # striding and dilating at the same time is unusual enough to deserve a warning
-        if stride > 1 and dilation > 1:
-            warnings.warn('StreamableConv1d has been initialized with stride > 1 and dilation > 1'
-                          f' (kernel_size={kernel_size} stride={stride}, dilation={dilation}).')
-        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
-                               dilation=dilation, groups=groups, bias=bias, causal=causal,
-                               norm=norm, norm_kwargs=norm_kwargs)
-        self.causal = causal
-        self.pad_mode = pad_mode
-    def forward(self, x):
-        _batch, _channels, _time = x.shape  # unpacking requires a 3-D input
-        conv = self.conv.conv
-        stride = conv.stride[0]
-        # effective kernel size with dilations
-        effective_kernel = (conv.kernel_size[0] - 1) * conv.dilation[0] + 1
-        padding_total = effective_kernel - stride
-        extra_padding = get_extra_padding_for_conv1d(x, effective_kernel, stride, padding_total)
-        if self.causal:
-            # Left padding for causal
-            paddings = (padding_total, extra_padding)
-        else:
-            # Asymmetric padding required for odd strides
-            padding_right = padding_total // 2
-            paddings = (padding_total - padding_right, padding_right + extra_padding)
-        return self.conv(pad1d(x, paddings, mode=self.pad_mode))
-class StreamableConvTranspose1d(nn.Module):
-    """ConvTranspose1d that takes care of asymmetric or causal padding and of
-    normalization by itself.
-    """
-    def __init__(self, in_channels: int, out_channels: int,
-                 kernel_size: int, stride: int = 1, causal: bool = False,
-                 norm: str = 'none', trim_right_ratio: float = 1.,
-                 norm_kwargs: tp.Dict[str, tp.Any] = {}):
-        super().__init__()
-        self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
-                                          causal=causal, norm=norm, norm_kwargs=norm_kwargs)
-        self.causal = causal
-        self.trim_right_ratio = trim_right_ratio
-        assert self.causal or self.trim_right_ratio == 1., \
-            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
-        assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.
-    def forward(self, x):
-        convtr = self.convtr.convtr
-        padding_total = convtr.kernel_size[0] - convtr.stride[0]
-        y = self.convtr(x)
-        # Only the fixed padding is trimmed here. Extra padding from `pad_for_conv1d` is
-        # removed at the very end, when keeping only the right length for the output,
-        # as removing it here would also require passing the length at the matching
-        # layer in the encoder.
-        if self.causal:
-            # Trim the padding on the right according to the specified ratio
-            # if trim_right_ratio = 1.0, trim everything from right
-            padding_right = math.ceil(padding_total * self.trim_right_ratio)
-        else:
-            # Asymmetric padding required for odd strides
-            padding_right = padding_total // 2
-        padding_left = padding_total - padding_right
-        return unpad1d(y, (padding_left, padding_right))

audiocraft/modules/diffusion_schedule.py DELETED Viewed

@@ -1,272 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Functions for Noise Schedule, defines diffusion process, reverse process and data processor.
-"""
-from collections import namedtuple
-import random
-import typing as tp
-import julius
-import torch
-TrainingItem = namedtuple("TrainingItem", "noisy noise step")
-def betas_from_alpha_bar(alpha_bar):
-    """Recover the per-step betas from a cumulative `alpha_bar` schedule.
-    Args:
-        alpha_bar (torch.Tensor): Cumulative products of the alphas along the first dimension.
-    Returns:
-        torch.Tensor: `1 - alpha` for every step, with the dtype and device of `alpha_bar`.
-    """
-    # The first alpha is alpha_bar[0] itself; every later one is the ratio of consecutive values.
-    # Slicing (instead of building a new `torch.Tensor`) keeps the input's dtype and device.
-    alphas = torch.cat([alpha_bar[:1], alpha_bar[1:] / alpha_bar[:-1]])
-    return 1 - alphas
-class SampleProcessor(torch.nn.Module):
-    """Identity sample processor; both methods hand their input back unchanged."""
-    def project_sample(self, x: torch.Tensor):
-        """Map an original sample into the 'space' where the diffusion happens."""
-        projected = x
-        return projected
-    def return_sample(self, z: torch.Tensor):
-        """Map a tensor from the diffusion space back to the actual sample space."""
-        restored = z
-        return restored
-class MultiBandProcessor(SampleProcessor):
-    """
-    MultiBand sample processor. The input audio is split across
-    frequency bands evenly distributed in mel-scale.
-    Each band will be rescaled to match the power distribution
-    of Gaussian noise in that band, using online metrics
-    computed on the first few samples.
-    Args:
-        n_bands (int): Number of mel-bands to split the signal over.
-        sample_rate (int): Sample rate of the audio.
-        num_samples (int): Number of samples to use to fit the rescaling
-            for each band. The processor won't be stable
-            until it has seen that many samples.
-        power_std (float or list/tensor): The rescaling factor computed to match the
-            power of Gaussian noise in each band is taken to
-            that power, i.e. `1.` means full correction of the energy
-            in each band, and values less than `1` means only partial
-            correction. Can be used to balance the relative importance
-            of low vs. high freq in typical audio signals.
-    """
-    def __init__(self, n_bands: int = 8, sample_rate: float = 24_000,
-                 num_samples: int = 10_000, power_std: tp.Union[float, tp.List[float], torch.Tensor] = 1.):
-        super().__init__()
-        self.n_bands = n_bands
-        self.split_bands = julius.SplitBands(sample_rate, n_bands=n_bands)
-        self.num_samples = num_samples
-        if isinstance(power_std, list):
-            assert len(power_std) == n_bands
-            power_std = torch.tensor(power_std)
-        # Store the value only after the list -> tensor conversion. Storing the raw list
-        # would make the `** power_std` exponentiation below fail.
-        self.power_std = power_std
-        # Running statistics accumulated by `project_sample`.
-        self.register_buffer('counts', torch.zeros(1))
-        self.register_buffer('sum_x', torch.zeros(n_bands))
-        self.register_buffer('sum_x2', torch.zeros(n_bands))
-        self.register_buffer('sum_target_x2', torch.zeros(n_bands))
-        self.counts: torch.Tensor
-        self.sum_x: torch.Tensor
-        self.sum_x2: torch.Tensor
-        self.sum_target_x2: torch.Tensor
-    def _power(self) -> tp.Union[float, torch.Tensor]:
-        """Return `power_std` usable as an exponent for the per-band statistics."""
-        if isinstance(self.power_std, torch.Tensor):
-            # `power_std` is a plain attribute, so it does not follow the module across devices.
-            return self.power_std.to(self.sum_x.device)
-        return self.power_std
-    @property
-    def mean(self):
-        """Per-band running mean."""
-        mean = self.sum_x / self.counts
-        return mean
-    @property
-    def std(self):
-        """Per-band running standard deviation (variance clamped at 0 before the square root)."""
-        std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
-        return std
-    @property
-    def target_std(self):
-        """Per-band running mean of the squared Gaussian reference bands."""
-        # NOTE(review): unlike `std`, no square root is applied here - confirm this is intended.
-        target_std = self.sum_target_x2 / self.counts
-        return target_std
-    def project_sample(self, x: torch.Tensor):
-        """Split `x` into bands, update the statistics while still fitting, and rescale each band.
-        Args:
-            x (torch.Tensor): 3-dimensional input tensor.
-        Returns:
-            torch.Tensor: Sum over the bands of the centered and rescaled band signals.
-        """
-        assert x.dim() == 3
-        bands = self.split_bands(x)
-        if self.counts.item() < self.num_samples:
-            # Still fitting: accumulate statistics of the data and of Gaussian noise of the same shape.
-            ref_bands = self.split_bands(torch.randn_like(x))
-            self.counts += len(x)
-            self.sum_x += bands.mean(dim=(2, 3)).sum(dim=1)
-            self.sum_x2 += bands.pow(2).mean(dim=(2, 3)).sum(dim=1)
-            self.sum_target_x2 += ref_bands.pow(2).mean(dim=(2, 3)).sum(dim=1)
-        rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self._power()  # same output size
-        bands = (bands - self.mean.view(-1, 1, 1, 1)) * rescale.view(-1, 1, 1, 1)
-        return bands.sum(dim=0)
-    def return_sample(self, x: torch.Tensor):
-        """Undo the per-band rescaling and centering applied by `project_sample`.
-        Args:
-            x (torch.Tensor): 3-dimensional tensor in the diffusion space.
-        Returns:
-            torch.Tensor: Sum over the bands of the un-scaled, re-centered band signals.
-        """
-        assert x.dim() == 3
-        bands = self.split_bands(x)
-        rescale = (self.std / self.target_std) ** self._power()
-        bands = bands * rescale.view(-1, 1, 1, 1) + self.mean.view(-1, 1, 1, 1)
-        return bands.sum(dim=0)
-class NoiseSchedule:
-    """Noise schedule for diffusion.
-    Args:
-        beta_t0 (float): Variance of the first diffusion step.
-        beta_t1 (float): Variance of the last diffusion step.
-        num_steps (int): Number of diffusion steps.
-        variance (str): Choice of the sigma value for the denoising equation.
-            Handled values: "beta", "beta_tilde" or "none".
-        clip (float): Clipping value for the denoising steps (a falsy value disables clipping).
-        rescale (float): Rescaling value to avoid vanishing signals, unused by default (i.e. 1).
-        device: Device on which the betas are created.
-        beta_exp (float): Power schedule exponent.
-        repartition (str): Shape of the schedule, only the "power" schedule is supported.
-        alpha_sigmoid (dict, optional): Not used by this class.
-        n_bands (int, optional): Must be None; any other value is rejected at construction.
-        sample_processor (SampleProcessor, optional): Module that normalizes data to better match the
-            Gaussian distribution. Defaults to a plain `SampleProcessor`.
-        noise_scale (float): Scaling factor for the noise.
-        **kwargs: Ignored.
-    """
-    def __init__(self, beta_t0: float = 1e-4, beta_t1: float = 0.02, num_steps: int = 1000, variance: str = 'beta',
-                 clip: float = 5., rescale: float = 1., device='cuda', beta_exp: float = 1,
-                 repartition: str = "power", alpha_sigmoid: tp.Optional[dict] = None,
-                 n_bands: tp.Optional[int] = None,
-                 sample_processor: tp.Optional[SampleProcessor] = None, noise_scale: float = 1.0, **kwargs):
-        self.beta_t0 = beta_t0
-        self.beta_t1 = beta_t1
-        self.variance = variance
-        self.num_steps = num_steps
-        self.clip = clip
-        # Build the default processor per instance instead of once at definition time.
-        self.sample_processor = SampleProcessor() if sample_processor is None else sample_processor
-        self.rescale = rescale
-        self.n_bands = n_bands
-        self.noise_scale = noise_scale
-        assert n_bands is None
-        if repartition == "power":
-            # Linear interpolation in the `1 / beta_exp` power domain, mapped back with `** beta_exp`.
-            self.betas = torch.linspace(beta_t0 ** (1 / beta_exp), beta_t1 ** (1 / beta_exp), num_steps,
-                                        device=device, dtype=torch.float) ** beta_exp
-        else:
-            raise RuntimeError('Not implemented')
-        # Dedicated, seeded generator used to draw the training step in `get_training_item`.
-        self.rng = random.Random(1234)
-    def get_beta(self, step: tp.Union[int, torch.Tensor]):
-        """Return the beta value(s) at `step`."""
-        if self.n_bands is None:
-            return self.betas[step]
-        else:
-            return self.betas[:, step]  # [n_bands, len(step)]
-    def get_initial_noise(self, x: torch.Tensor):
-        """Return standard Gaussian noise to start the reverse process for a batch shaped like `x`."""
-        if self.n_bands is None:
-            return torch.randn_like(x)
-        # Keep the noise on the same device and dtype as the reference tensor.
-        return torch.randn((x.size(0), self.n_bands, x.size(2)), device=x.device, dtype=x.dtype)
-    def get_alpha_bar(self, step: tp.Optional[tp.Union[int, torch.Tensor]] = None) -> torch.Tensor:
-        """Return 'alpha_bar', either for a given step, or as a tensor with its value for each step."""
-        if step is None:
-            return (1 - self.betas).cumprod(dim=-1)  # works for single and multi bands
-        if isinstance(step, int):
-            # For step == -1 the slice is empty and the product is 1.
-            return (1 - self.betas[:step + 1]).prod()
-        else:
-            return (1 - self.betas).cumprod(dim=0)[step].view(-1, 1, 1)
-    def get_training_item(self, x: torch.Tensor, tensor_step: bool = False) -> TrainingItem:
-        """Create a noisy data item for diffusion model training:
-        Args:
-            x (torch.Tensor): clean audio data torch.tensor(bs, 1, T)
-            tensor_step (bool): If tensor_step = false, only one step t is sampled,
-                the whole batch is diffused to the same step and t is an int.
-                If tensor_step = true, t is a tensor of size (x.size(0),) and
-                every element of the batch is diffused to an independently sampled step.
-        Returns:
-            TrainingItem: the noisy sample, the noise that was added and the sampled step.
-        """
-        step: tp.Union[int, torch.Tensor]
-        if tensor_step:
-            bs = x.size(0)
-            step = torch.randint(0, self.num_steps, size=(bs,), device=x.device)
-        else:
-            step = self.rng.randrange(self.num_steps)
-        alpha_bar = self.get_alpha_bar(step)  # [batch_size, n_bands, 1]
-        x = self.sample_processor.project_sample(x)
-        noise = torch.randn_like(x)
-        noisy = (alpha_bar.sqrt() / self.rescale) * x + (1 - alpha_bar).sqrt() * noise * self.noise_scale
-        return TrainingItem(noisy, noise, step)
-    def generate(self, model: torch.nn.Module, initial: tp.Optional[torch.Tensor] = None,
-                 condition: tp.Optional[torch.Tensor] = None, return_list: bool = False):
-        """Full ddpm reverse process.
-        Args:
-            model (nn.Module): Diffusion model.
-            initial (tensor): Initial Noise.
-            condition (tensor): Input conditioning Tensor (e.g. encodec compressed representation).
-            return_list (bool): Whether to return the whole process or only the sampled point.
-        Raises:
-            ValueError: If `initial` is None or the variance type is not supported.
-        """
-        if initial is None:
-            # Without a starting tensor there is nothing to denoise; fail early with a clear message.
-            raise ValueError("`initial` noise tensor is required to run the reverse process.")
-        alpha_bar = self.get_alpha_bar(step=self.num_steps - 1)
-        current = initial
-        previous = current  # keeps the return value defined when there is no step to run
-        iterates = [initial]
-        for step in range(self.num_steps)[::-1]:
-            with torch.no_grad():
-                estimate = model(current, step, condition=condition).sample
-            alpha = 1 - self.betas[step]
-            previous = (current - (1 - alpha) / (1 - alpha_bar).sqrt() * estimate) / alpha.sqrt()
-            previous_alpha_bar = self.get_alpha_bar(step=step - 1)
-            if step == 0:
-                sigma2 = 0
-            elif self.variance == 'beta':
-                sigma2 = 1 - alpha
-            elif self.variance == 'beta_tilde':
-                sigma2 = (1 - previous_alpha_bar) / (1 - alpha_bar) * (1 - alpha)
-            elif self.variance == 'none':
-                sigma2 = 0
-            else:
-                raise ValueError(f'Invalid variance type {self.variance}')
-            if sigma2 > 0:
-                previous += sigma2**0.5 * torch.randn_like(previous) * self.noise_scale
-            if self.clip:
-                previous = previous.clamp(-self.clip, self.clip)
-            current = previous
-            alpha_bar = previous_alpha_bar
-            if step == 0:
-                # Undo the `1 / rescale` factor applied in `get_training_item`.
-                previous *= self.rescale
-            if return_list:
-                iterates.append(previous.cpu())
-        if return_list:
-            return iterates
-        else:
-            return self.sample_processor.return_sample(previous)
-    def generate_subsampled(self, model: torch.nn.Module, initial: torch.Tensor, step_list: tp.Optional[list] = None,
-                            condition: tp.Optional[torch.Tensor] = None, return_list: bool = False):
-        """Reverse process that only goes through Markov chain states in step_list.
-        Args:
-            model (nn.Module): Diffusion model.
-            initial (tensor): Initial Noise.
-            step_list (list, optional): Steps to visit, in decreasing order. Defaults to every 50th step
-                counting down from `num_steps - 1`, followed by 0.
-            condition (tensor): Input conditioning Tensor.
-            return_list (bool): Whether to return the whole process or only the sampled point.
-        """
-        if step_list is None:
-            # Derived from `num_steps` so that schedules with a length other than 1000 stay in range.
-            step_list = list(range(self.num_steps))[::-50] + [0]
-        alpha_bar = self.get_alpha_bar(step=self.num_steps - 1)
-        alpha_bars_subsampled = (1 - self.betas).cumprod(dim=0)[list(reversed(step_list))].cpu()
-        betas_subsampled = betas_from_alpha_bar(alpha_bars_subsampled)
-        current = initial * self.noise_scale
-        previous = current  # keeps the return value defined when `step_list` has fewer than two entries
-        iterates = [current]
-        for idx, step in enumerate(step_list[:-1]):
-            with torch.no_grad():
-                estimate = model(current, step, condition=condition).sample * self.noise_scale
-            alpha = 1 - betas_subsampled[-1 - idx]
-            previous = (current - (1 - alpha) / (1 - alpha_bar).sqrt() * estimate) / alpha.sqrt()
-            previous_alpha_bar = self.get_alpha_bar(step_list[idx + 1])
-            if step == step_list[-2]:
-                # Last visited step: no noise is added.
-                sigma2 = 0
-                previous_alpha_bar = torch.tensor(1.0)
-            else:
-                sigma2 = (1 - previous_alpha_bar) / (1 - alpha_bar) * (1 - alpha)
-            if sigma2 > 0:
-                previous += sigma2**0.5 * torch.randn_like(previous) * self.noise_scale
-            if self.clip:
-                previous = previous.clamp(-self.clip, self.clip)
-            current = previous
-            alpha_bar = previous_alpha_bar
-            # NOTE(review): the loop stops at step_list[-2], so with the default list for
-            # num_steps=1000 step 0 is never visited and this rescale does not fire - confirm intended.
-            if step == 0:
-                previous *= self.rescale
-            if return_list:
-                iterates.append(previous.cpu())
-        if return_list:
-            return iterates
-        else:
-            return self.sample_processor.return_sample(previous)

audiocraft/modules/jasco_conditioners.py DELETED Viewed

@@ -1,300 +0,0 @@
-import torch
-import typing as tp
-from itertools import chain
-from pathlib import Path
-from torch import nn
-from .conditioners import (ConditioningAttributes, BaseConditioner, ConditionType,
-                           ConditioningProvider, JascoCondConst,
-                           WaveformConditioner, WavCondition, SymbolicCondition)
-from ..data.audio import audio_read
-from ..data.audio_utils import convert_audio
-from ..utils.autocast import TorchAutocast
-from ..utils.cache import EmbeddingCache
-class MelodyConditioner(BaseConditioner):
-    """
-    Conditioner for melody conditioning based on a pre-computed salience matrix.
-    Attributes:
-        card (int): The cardinality of the melody matrix.
-        out_dim (int): The dimensionality of the output projection.
-        device (Union[torch.device, str]): The device the melody is moved to in `tokenize`.
-    """
-    def __init__(self, card: int, out_dim: int, device: tp.Union[torch.device, str] = 'cpu', **kwargs):
-        super().__init__(dim=card, output_dim=out_dim)
-        self.device = device
-    def tokenize(self, x: SymbolicCondition) -> SymbolicCondition:
-        """Return a new condition holding the melody moved to `self.device`."""
-        melody_on_device = x.melody.to(self.device)  # type: ignore
-        return SymbolicCondition(melody=melody_on_device)
-    def forward(self, x: SymbolicCondition) -> ConditionType:
-        """Project the melody (after swapping its last two dims) and return it with an all-ones mask."""
-        projected = self.output_proj(x.melody.permute(0, 2, 1))  # type: ignore
-        all_valid = torch.ones_like(projected[..., 0])
-        return projected, all_valid
-class ChordsEmbConditioner(BaseConditioner):
-    """
-    Conditioner mapping chord symbols to continuous embedding vectors.
-    Attributes:
-        card (int): The cardinality of the chord vocabulary.
-        out_dim (int): The dimensionality of the output embeddings.
-        device (Union[torch.device, str]): The device on which the embeddings are stored.
-    """
-    def __init__(self, card: int, out_dim: int, device: tp.Union[torch.device, str] = 'cpu', **kwargs):
-        # One extra entry is reserved for the null chord used during dropout.
-        vocab_size = card + 1
-        # output_dim=-1 avoids another projection on top of the embedding.
-        super().__init__(dim=vocab_size, output_dim=-1)
-        self.emb = nn.Embedding(vocab_size, out_dim, device=device)
-        self.device = device
-    def tokenize(self, x: SymbolicCondition) -> SymbolicCondition:
-        """Return a new condition holding the frame chords moved to `self.device`."""
-        chords_on_device = x.frame_chords.to(self.device)  # type: ignore
-        return SymbolicCondition(frame_chords=chords_on_device)
-    def forward(self, x: SymbolicCondition) -> ConditionType:
-        """Embed the frame chords and return the embeddings with an all-ones mask."""
-        embedded = self.emb(x.frame_chords)
-        all_valid = torch.ones_like(embedded[..., 0])
-        return embedded, all_valid
-class DrumsConditioner(WaveformConditioner):
-    """Waveform conditioner built on the drum stem of the conditioning audio.
-    The drum stem is separated with the pretrained `htdemucs` model, encoded and quantized with the
-    first quantizer layer of `self.compression_model`, decoded back to latents and temporally blurred.
-    `self.compression_model` starts as None and has to be assigned before embeddings can be computed.
-    """
-    def __init__(self, out_dim: int, sample_rate: int, blurring_factor: int = 3,
-                 cache_path: tp.Optional[tp.Union[str, Path]] = None,
-                 compression_model_latent_dim: int = 128,
-                 compression_model_framerate: float = 50,
-                 segment_duration: float = 10.0,
-                 device: tp.Union[torch.device, str] = 'cpu',
-                 **kwargs):
-        """Drum condition conditioner
-        Args:
-            out_dim (int): output dimension handed to the parent conditioner.
-            sample_rate (int): sample rate the extracted drum stem is converted to.
-            blurring_factor (int, optional): number of consecutive frames averaged by the temporal blur.
-                Defaults to 3.
-            cache_path (tp.Optional[tp.Union[str, Path]], optional): path to precomputed cache; not read
-                here, see `create_embedding_cache`. Defaults to None.
-            compression_model_latent_dim (int, optional): latent dimension. Defaults to 128.
-            compression_model_framerate (float, optional): frame rate of the representation model. Defaults to 50.
-            segment_duration (float, optional): duration in sec for each audio segment. Defaults to 10.0.
-            device (tp.Union[torch.device, str], optional): device. Defaults to 'cpu'.
-        """
-        from demucs import pretrained
-        self.sample_rate = sample_rate
-        # Written straight into __dict__, bypassing attribute assignment hooks
-        # (presumably so demucs is not tracked as a submodule - TODO confirm).
-        self.__dict__['demucs'] = pretrained.get_model('htdemucs').to(device)
-        self.stem_idx = self.demucs.sources.index('drums')  # type: ignore
-        self.compression_model = None
-        self.latent_dim = compression_model_latent_dim
-        super().__init__(dim=self.latent_dim, output_dim=out_dim, device=device)
-        # NOTE(review): `device != 'cpu'` compares against a string; confirm it behaves as expected
-        # when a `torch.device` object is passed.
-        self.autocast = TorchAutocast(enabled=device != 'cpu', device_type=self.device, dtype=torch.float32)
-        self._use_masking = False
-        self.blurring_factor = blurring_factor
-        self.seq_len = int(segment_duration * compression_model_framerate)
-        self.cache = None  # If you wish to train with EmbeddingCache, call self.create_embedding_cache(cache_path)
-    def create_embedding_cache(self, cache_path):
-        """Create the embedding cache under `<cache_path>/wav`; does nothing when `cache_path` is None."""
-        if cache_path is None:
-            return
-        self.cache = EmbeddingCache(Path(cache_path) / 'wav', self.device,
-                                    compute_embed_fn=self._calc_coarse_drum_codes_for_cache,
-                                    extract_embed_fn=self._load_drum_codes_chunk)
-    @torch.no_grad()
-    def _get_drums_stem(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
-        """Separate the stems of `wav` with demucs and return the drum stem, mono, at `self.sample_rate`."""
-        from demucs.apply import apply_model
-        from demucs.audio import convert_audio
-        with self.autocast:
-            # Match the sample rate and channel count expected by demucs.
-            demucs_input = convert_audio(
-                wav, sample_rate, self.demucs.samplerate, self.demucs.audio_channels)  # type: ignore
-            separated = apply_model(self.demucs, demucs_input, device=self.device)
-            # Keep only the stem used for drums conditioning.
-            drums_only = separated[:, self.stem_idx]
-            return convert_audio(drums_only, self.demucs.samplerate, self.sample_rate, 1)  # type: ignore
-    def _temporal_blur(self, z: torch.Tensor):
-        """Replace every group of `blurring_factor` consecutive frames by the group average."""
-        # z: (B, T, C)
-        B, T, C = z.shape
-        factor = self.blurring_factor
-        remainder = T % factor
-        if remainder != 0:
-            # Reflect-pad on the right of dim=1 so that the length becomes a multiple of the factor.
-            z = torch.nn.functional.pad(z, (0, 0, 0, factor - remainder), mode='reflect')
-        group_avg = z.reshape(B, -1, factor, C).sum(dim=2) / factor
-        # Repeat every average `factor` times, then cut back to the original length.
-        blurred = group_avg.unsqueeze(2).repeat(1, 1, factor, 1).reshape(B, -1, C)
-        blurred = blurred[:, :T]
-        assert blurred.shape == (B, T, C)
-        return blurred
-    @torch.no_grad()
-    def _extract_coarse_drum_codes(self, wav: torch.Tensor, sample_rate: int) -> torch.Tensor:
-        """Return int16 codes of the drum stem of `wav` from the first quantizer layer."""
-        assert self.compression_model is not None
-        # stem separation of drums
-        drum_wav = self._get_drums_stem(wav, sample_rate)
-        # continuous encoding with compression model
-        latents = self.compression_model.model.encoder(drum_wav)
-        # quantization to coarsest codebook
-        first_layer = self.compression_model.model.quantizer.layers[0]
-        return first_layer.encode(latents).to(torch.int16)
-    @torch.no_grad()
-    def _calc_coarse_drum_codes_for_cache(self, path: tp.Union[str, Path],
-                                          x: WavCondition, idx: int,
-                                          max_duration_to_process: float = 600) -> torch.Tensor:
-        """Extract coarse drum codes from the whole audio waveform at the given path."""
-        wav, sr = audio_read(path)
-        wav = wav[None].to(self.device)
-        wav = convert_audio(wav, sr, self.sample_rate, to_channels=1)
-        max_frames_to_process = int(max_duration_to_process * self.sample_rate)
-        total_frames = wav.shape[-1]
-        if total_frames <= max_frames_to_process:
-            return self._extract_coarse_drum_codes(wav, self.sample_rate)[0]
-        # process very long tracks in chunks
-        codes = []
-        start = 0
-        while start < total_frames - 1:
-            wav_chunk = wav[..., start: start + max_frames_to_process]
-            codes.append(self._extract_coarse_drum_codes(wav_chunk, self.sample_rate)[0])
-            start += max_frames_to_process
-        return torch.cat(codes)
-    def _load_drum_codes_chunk(self, full_coarse_drum_codes: torch.Tensor, x: WavCondition, idx: int) -> torch.Tensor:
-        """Extract a chunk of coarse drum codes from the full coarse drum codes derived from the full waveform."""
-        wav_length = x.wav.shape[-1]
-        seek_time = x.seek_time[idx]
-        assert seek_time is not None, (
-            "WavCondition seek_time is required "
-            "when extracting chunks from pre-computed drum codes.")
-        assert self.compression_model is not None
-        frame_rate = self.compression_model.frame_rate
-        # Never return fewer than `seq_len` frames.
-        target_length = max(int(frame_rate * wav_length / self.sample_rate), self.seq_len)
-        first_frame = int(frame_rate * seek_time)
-        chunk = full_coarse_drum_codes[first_frame: first_frame + target_length]
-        # pad with zeros up to the target length
-        missing = target_length - chunk.shape[0]
-        filler = torch.zeros(missing, dtype=chunk.dtype, device=chunk.device)
-        return torch.cat((chunk, filler)).to(self.device)
-    @torch.no_grad()
-    def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
-        """Return the blurred drum latents for `x`, or zeros when the condition is nullified."""
-        bs = x.wav.shape[0]
-        if x.wav.shape[-1] <= 1:
-            # null condition
-            return torch.zeros((bs, self.seq_len, self.latent_dim), device=x.wav.device, dtype=x.wav.dtype)
-        # extract coarse drum codes (a nullified condition already returned above)
-        no_undefined_paths = all(p is not None for p in x.path)
-        if self.cache is not None and no_undefined_paths:
-            paths = [Path(p) for p in x.path if p is not None]
-            codes = self.cache.get_embed_from_cache(paths, x)
-        else:
-            assert all(sr == x.sample_rate[0] for sr in x.sample_rate), "All sample rates in batch should be equal."
-            codes = self._extract_coarse_drum_codes(x.wav, x.sample_rate[0])
-        assert self.compression_model is not None
-        # decode back to the continuous representation of compression model
-        codes = codes.unsqueeze(1).permute(1, 0, 2)  # (B, T) -> (1, B, T)
-        codes = codes.to(torch.int64)
-        latents = self.compression_model.model.quantizer.decode(codes)
-        latents = latents.permute(0, 2, 1)  # [B, C, T] -> [B, T, C]
-        # temporal blurring
-        return self._temporal_blur(latents)
-    def tokenize(self, x: WavCondition) -> WavCondition:
-        """Apply WavConditioner tokenization and populate cache if needed."""
-        x = super().tokenize(x)
-        no_undefined_paths = all(p is not None for p in x.path)
-        if self.cache is not None and no_undefined_paths:
-            self.cache.populate_embed_cache([Path(p) for p in x.path if p is not None], x)
-        return x
-class JascoConditioningProvider(ConditioningProvider):
-    """
-    A cond-provider that manages and tokenizes various types of conditioning attributes for Jasco models.
-    Attributes:
-        chords_card (int): The cardinality of the chord vocabulary.
-        sequence_length (int): The length of the sequence for padding purposes.
-        melody_dim (int): The dimensionality of the melody matrix.
-    """
-    def __init__(self, *args,
-                 chords_card: int = 194,
-                 sequence_length: int = 500,
-                 melody_dim: int = 53, **kwargs):
-        # The null chord token is the index right after the last real chord.
-        self.null_chord = chords_card
-        self.sequence_len = sequence_length
-        self.melody_dim = melody_dim
-        super().__init__(*args, **kwargs)
-    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
-        """Match attributes/wavs with existing conditioners in self, and tokenize them accordingly.
-        This should be called before starting any real GPU work to avoid synchronization points.
-        This will return a dict matching conditioner names to their arbitrary tokenized representations.
-        Args:
-            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
-                text and wav conditions.
-        """
-        # The messages are single strings (they used to be tuples of two strings by accident).
-        assert all(isinstance(x, ConditioningAttributes) for x in inputs), (
-            "Got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]"
-            f" but types were {set([type(x) for x in inputs])}"
-        )
-        output = {}
-        text = self._collate_text(inputs)
-        wavs = self._collate_wavs(inputs)
-        symbolic = self._collate_symbolic(inputs, self.conditioners.keys())
-        assert set(text.keys() | wavs.keys() | symbolic.keys()).issubset(set(self.conditioners.keys())), (
-            f"Got an unexpected attribute! Expected {self.conditioners.keys()}, "
-            f"got {text.keys(), wavs.keys(), symbolic.keys()}"
-        )
-        for attribute, batch in chain(text.items(), wavs.items(), symbolic.items()):
-            output[attribute] = self.conditioners[attribute].tokenize(batch)
-        return output
-    def _collate_symbolic(self, samples: tp.List[ConditioningAttributes],
-                          conditioner_keys: tp.Set) -> tp.Dict[str, SymbolicCondition]:
-        """Collate the symbolic (chords / melody) conditions of `samples` into batched conditions.
-        Missing or degenerate conditions are replaced in place on each sample by a null condition.
-        Args:
-            samples (list[ConditioningAttributes]): Samples to collate; their `symbolic` dict may be updated.
-            conditioner_keys (set): Names of the available conditioners.
-        Returns:
-            dict[str, SymbolicCondition]: One batched condition per symbolic conditioner present.
-        """
-        output = {}
-        # collate if symbolic cond exists
-        if any(x in conditioner_keys for x in JascoCondConst.SYM.value):
-            crd_key = JascoCondConst.CRD.value
-            mld_key = JascoCondConst.MLD.value
-            for s in samples:
-                # hydrate with null chord if chords not exist - for inference support.
-                # `.get` also covers a non-empty dict lacking this key, which used to raise KeyError.
-                chords = s.symbolic.get(crd_key)
-                if (chords is None or
-                        chords.frame_chords is None or
-                        chords.frame_chords.shape[-1] <= 1):  # type: ignore
-                    # no chords conditioning - fill with null chord token
-                    s.symbolic[crd_key] = SymbolicCondition(
-                        frame_chords=torch.ones(self.sequence_len, dtype=torch.int32) * self.null_chord)
-                melody = s.symbolic.get(mld_key)
-                if (melody is None or
-                        melody.melody is None or
-                        melody.melody.shape[-1] <= 1):  # type: ignore
-                    # no melody conditioning - fill with an all-zeros melody matrix
-                    s.symbolic[mld_key] = SymbolicCondition(
-                        melody=torch.zeros((self.melody_dim, self.sequence_len)))
-            if crd_key in conditioner_keys:
-                # pad every chord sequence with the null chord up to the longest one
-                max_seq_len = max(
-                    [s.symbolic[crd_key].frame_chords.shape[-1] for s in samples])  # type: ignore
-                padded_chords = [
-                    torch.cat((x.symbolic[crd_key].frame_chords,   # type: ignore
-                               torch.ones(max_seq_len -
-                                          x.symbolic[crd_key].frame_chords.shape[-1],  # type: ignore
-                                          dtype=torch.int32) * self.null_chord))
-                    for x in samples
-                ]
-                output[crd_key] = SymbolicCondition(frame_chords=torch.stack(padded_chords))
-            if mld_key in conditioner_keys:
-                melodies = torch.stack([x.symbolic[mld_key].melody for x in samples])  # type: ignore
-                output[mld_key] = SymbolicCondition(melody=melodies)
-        return output

audiocraft/modules/lstm.py DELETED Viewed

@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from torch import nn
-class StreamableLSTM(nn.Module):
-    """LSTM that hides the hidden state handling and the data layout from the caller.
-    Expects input in convolutional layout.
-    """
-    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
-        super().__init__()
-        self.skip = skip
-        # Input and hidden sizes are both `dimension`, so the output can be added back to the input.
-        self.lstm = nn.LSTM(dimension, dimension, num_layers)
-    def forward(self, x):
-        """Run the LSTM over the last dimension of `x` and return a tensor in the input layout."""
-        # Move the last dimension first, which is the layout the LSTM is fed with.
-        seq_first = x.permute(2, 0, 1)
-        out = self.lstm(seq_first)[0]
-        if self.skip:
-            out = out + seq_first
-        # Restore the original dimension order.
-        return out.permute(1, 2, 0)

audiocraft/modules/rope.py DELETED Viewed

@@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-from torch import nn
-import torch
-class XPos(nn.Module):
-    """Length-extrapolatable positional embedding (xPos) from [Sun et al 2022](https://arxiv.org/abs/2212.10554v1).
-    This applies an exponential decay to the RoPE rotation matrix.
-    Args:
-        dim (int): Embedding dimension.
-        smoothing (float): Smoothing factor applied to the decay rates.
-        base_scale (int): Base decay rate, given in terms of scaling time.
-        device (torch.device, optional): Device on which to initialize the module.
-        dtype (torch.dtype): dtype to use to generate the embedding.
-    """
-    def __init__(self, dim: int, smoothing: float = 0.4, base_scale: int = 512,
-                 device=None, dtype: torch.dtype = torch.float32):
-        super().__init__()
-        assert dim % 2 == 0
-        assert dtype in [torch.float64, torch.float32]
-        self.dtype = dtype
-        self.base_scale = base_scale
-        half_dim = dim // 2
-        adim = torch.arange(half_dim, device=device, dtype=dtype)
-        decay_rates = (adim / half_dim + smoothing) / (1.0 + smoothing)
-        self.register_buffer("decay_rates", decay_rates)
-        # Lazily built cache of complex decay values, see `get_decay`.
-        self.decay: tp.Optional[torch.Tensor] = None
-    def get_decay(self, start: int, end: int):
-        """Create complex decay tensor, cache values for fast computation.
-        Args:
-            start (int): First position (inclusive).
-            end (int): Last position (exclusive).
-        Returns:
-            torch.Tensor: Complex decay values for positions `start` to `end`, shape [T, C/2].
-        """
-        assert isinstance(self.decay_rates, torch.Tensor)  # Satisfy type checker.
-        device = self.decay_rates.device
-        # The cache is a plain attribute and does not follow the module across devices,
-        # so it is also rebuilt when it no longer lives on the buffer's device.
-        if self.decay is None or end > self.decay.shape[0] or self.decay.device != device:
-            idx = torch.arange(end, device=device, dtype=self.dtype)
-            power = idx / self.base_scale
-            scale = self.decay_rates ** power.unsqueeze(-1)
-            # Zero angle: the decay is a real scaling stored as a complex number.
-            self.decay = torch.polar(scale, torch.zeros_like(scale))
-        return self.decay[start:end]  # [T, C/2]
-class RotaryEmbedding(nn.Module):
-    """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864).
-    Args:
-        dim (int): Embedding dimension (twice the number of frequencies).
-        max_period (float): Maximum period of the rotation frequencies.
-        xpos (bool): Use xPos, applies an exponential decay to rotation matrix.
-        scale (float): Scale of positional embedding, set to 0 to deactivate.
-        device (torch.device, optional): Device on which to initialize the module.
-        dtype (torch.dtype): dtype to use to generate the embedding.
-    """
-    def __init__(self, dim: int, max_period: float = 10000.0, xpos: bool = False,
-                 scale: float = 1.0, device=None, dtype: torch.dtype = torch.float32):
-        super().__init__()
-        assert dim % 2 == 0
-        self.scale = scale
-        assert dtype in [torch.float64, torch.float32]
-        self.dtype = dtype
-        adim = torch.arange(0, dim, 2, device=device, dtype=dtype)[: (dim // 2)]
-        frequencies = 1.0 / (max_period ** (adim / dim))
-        self.register_buffer("frequencies", frequencies)
-        # Lazily built cache of complex rotations, see `get_rotation`.
-        self.rotation: tp.Optional[torch.Tensor] = None
-        self.xpos = XPos(dim, device=device, dtype=dtype) if xpos else None
-    def get_rotation(self, start: int, end: int):
-        """Create complex rotation tensor, cache values for fast computation.
-        Args:
-            start (int): First position (inclusive).
-            end (int): Last position (exclusive).
-        Returns:
-            torch.Tensor: Unit-modulus complex rotations for positions `start` to `end`.
-        """
-        assert isinstance(self.frequencies, torch.Tensor)  # Satisfy type checker.
-        device = self.frequencies.device
-        # The cache is a plain attribute and does not follow the module across devices,
-        # so it is also rebuilt when it no longer lives on the buffer's device.
-        if self.rotation is None or end > self.rotation.shape[0] or self.rotation.device != device:
-            idx = torch.arange(end, device=device, dtype=self.dtype)
-            angles = torch.outer(idx, self.frequencies)
-            self.rotation = torch.polar(torch.ones_like(angles), angles)
-        return self.rotation[start:end]
-    def rotate(self, x: torch.Tensor, start: int = 0, time_dim: int = 1, invert_decay: bool = False):
-        """Apply rope rotation to query or key tensor.
-        Args:
-            x (torch.Tensor): Tensor to rotate; its last dimension is read as pairs of (real, imag) values.
-            start (int): Position of the first time step of `x`.
-            time_dim (int): Which dimension of `x` represents the time steps.
-            invert_decay (bool): Whether to use the inverse of the xPos decay.
-        Returns:
-            torch.Tensor: Rotated tensor with the shape and dtype of `x`.
-        """
-        T = x.shape[time_dim]
-        # Broadcastable shape: 1 everywhere except the time and the last dimension.
-        target_shape = [1] * x.dim()
-        target_shape[time_dim] = T
-        target_shape[-1] = -1
-        rotation = self.get_rotation(start, start + T).view(target_shape)
-        if self.xpos is not None:
-            decay = self.xpos.get_decay(start, start + T).view(target_shape)
-        else:
-            decay = 1.0
-        if invert_decay:
-            decay = decay ** -1
-        x_complex = torch.view_as_complex(x.to(self.dtype).reshape(*x.shape[:-1], -1, 2))
-        # `scale` blends between the full rotation (1.0) and the identity (0.0).
-        scaled_rotation = (rotation * decay) * self.scale + (1.0 - self.scale)
-        x_out = torch.view_as_real(x_complex * scaled_rotation).view_as(x)
-        return x_out.type_as(x)
-    def rotate_qk(self, query: torch.Tensor, key: torch.Tensor, start: int = 0, time_dim: int = 1):
-        """ Apply rope rotation to both query and key tensors.
-        Supports streaming mode, in which query and key are not expected to have the same shape.
-        In streaming mode, key will be of length [P + C] with P the cached past timesteps, but
-        query will be [C] (typically C == 1).
-        Args:
-            query (torch.Tensor): Query to rotate.
-            key (torch.Tensor): Key to rotate.
-            start (int): Start index of the sequence for time offset.
-            time_dim (int): which dimension represent the time steps.
-        Returns:
-            tuple[torch.Tensor, torch.Tensor]: The rotated query and the rotated key.
-        """
-        query_timesteps = query.shape[time_dim]
-        key_timesteps = key.shape[time_dim]
-        # The query covers the last time steps of the key, so its positions are shifted accordingly.
-        streaming_offset = key_timesteps - query_timesteps
-        query_out = self.rotate(query, start + streaming_offset, time_dim)
-        key_out = self.rotate(key, start, time_dim, invert_decay=True)
-        return query_out, key_out

audiocraft/modules/seanet.py DELETED Viewed

@@ -1,258 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-import numpy as np
-import torch.nn as nn
-from .conv import StreamableConv1d, StreamableConvTranspose1d
-from .lstm import StreamableLSTM
-class SEANetResnetBlock(nn.Module):
-    """Residual block from SEANet model.
-    Args:
-        dim (int): Dimension of the input/output.
-        kernel_sizes (list): List of kernel sizes for the convolutions.
-        dilations (list): List of dilations for the convolutions.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-        norm (str): Normalization method.
-        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
-        causal (bool): Whether to use fully causal convolution.
-        pad_mode (str): Padding mode for the convolutions.
-        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
-        true_skip (bool): Whether to use true skip connection or a simple
-            (streamable) convolution as the skip connection.
-    """
-    def __init__(self, dim: int, kernel_sizes: tp.List[int] = [3, 1], dilations: tp.List[int] = [1, 1],
-                 activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
-                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, causal: bool = False,
-                 pad_mode: str = 'reflect', compress: int = 2, true_skip: bool = True):
-        super().__init__()
-        assert len(kernel_sizes) == len(dilations), 'Number of kernel sizes should match number of dilations'
-        act = getattr(nn, activation)
-        hidden = dim // compress
-        block = []
-        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
-            in_chs = dim if i == 0 else hidden
-            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
-            block += [
-                act(**activation_params),
-                StreamableConv1d(in_chs, out_chs, kernel_size=kernel_size, dilation=dilation,
-                                 norm=norm, norm_kwargs=norm_params,
-                                 causal=causal, pad_mode=pad_mode),
-            ]
-        self.block = nn.Sequential(*block)
-        self.shortcut: nn.Module
-        if true_skip:
-            self.shortcut = nn.Identity()
-        else:
-            self.shortcut = StreamableConv1d(dim, dim, kernel_size=1, norm=norm, norm_kwargs=norm_params,
-                                             causal=causal, pad_mode=pad_mode)
-    def forward(self, x):
-        return self.shortcut(x) + self.block(x)
-class SEANetEncoder(nn.Module):
-    """SEANet encoder.
-    The model is a single `nn.Sequential`: an initial convolution, then for each ratio
-    `n_residual_layers` residual blocks followed by a strided (downsampling) convolution,
-    then an optional LSTM and a final convolution projecting to `dimension`.
-    Args:
-        channels (int): Audio channels.
-        dimension (int): Intermediate representation dimension.
-        n_filters (int): Base width for the model.
-        n_residual_layers (int): nb of residual layers.
-        ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
-            upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
-            that must match the decoder order. We use the decoder order as some models may only employ the decoder.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-        norm (str): Normalization method.
-        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
-        kernel_size (int): Kernel size for the initial convolution.
-        last_kernel_size (int): Kernel size for the final convolution.
-        residual_kernel_size (int): Kernel size for the residual layers.
-        dilation_base (int): How much to increase the dilation with each layer.
-        causal (bool): Whether to use fully causal convolution.
-        pad_mode (str): Padding mode for the convolutions.
-        true_skip (bool): Whether to use true skip connection or a simple
-            (streamable) convolution as the skip connection in the residual network blocks.
-        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
-        lstm (int): Number of LSTM layers at the end of the encoder.
-        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
-            For the encoder, it corresponds to the N first blocks.
-    """
-    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
-                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
-                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
-                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
-                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
-                 disable_norm_outer_blocks: int = 0):
-        super().__init__()
-        self.channels = channels
-        self.dimension = dimension
-        self.n_filters = n_filters
-        # Ratios are given in decoder order; the encoder applies them reversed.
-        self.ratios = list(reversed(ratios))
-        # Drop the local name so that the non-reversed list cannot be used by mistake below.
-        del ratios
-        self.n_residual_layers = n_residual_layers
-        # Product of all strides, i.e. the overall downsampling factor of the strided convolutions.
-        self.hop_length = np.prod(self.ratios)
-        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
-        self.disable_norm_outer_blocks = disable_norm_outer_blocks
-        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
-            "Number of blocks for which to disable norm is invalid." \
-            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
-        act = getattr(nn, activation)
-        # Channel multiplier, doubled after every downsampling stage.
-        mult = 1
-        # Block 1: initial convolution (norm disabled as soon as one outer block is disabled).
-        model: tp.List[nn.Module] = [
-            StreamableConv1d(channels, mult * n_filters, kernel_size,
-                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
-                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
-        ]
-        # Downsample from the raw audio scale
-        for i, ratio in enumerate(self.ratios):
-            # Stage `i` is block number `i + 2` (the initial convolution being block 1).
-            block_norm = 'none' if self.disable_norm_outer_blocks >= i + 2 else norm
-            # Add residual layers
-            for j in range(n_residual_layers):
-                model += [
-                    SEANetResnetBlock(mult * n_filters, kernel_sizes=[residual_kernel_size, 1],
-                                      dilations=[dilation_base ** j, 1],
-                                      norm=block_norm, norm_params=norm_params,
-                                      activation=activation, activation_params=activation_params,
-                                      causal=causal, pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
-            # Add downsampling layers
-            model += [
-                act(**activation_params),
-                StreamableConv1d(mult * n_filters, mult * n_filters * 2,
-                                 kernel_size=ratio * 2, stride=ratio,
-                                 norm=block_norm, norm_kwargs=norm_params,
-                                 causal=causal, pad_mode=pad_mode),
-            ]
-            mult *= 2
-        if lstm:
-            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
-        # Last block: final convolution; its norm is only disabled when every block is disabled.
-        model += [
-            act(**activation_params),
-            StreamableConv1d(mult * n_filters, dimension, last_kernel_size,
-                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
-                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
-        ]
-        self.model = nn.Sequential(*model)
-    def forward(self, x):
-        return self.model(x)
-class SEANetDecoder(nn.Module):
-    """SEANet decoder.
-    The model is a single `nn.Sequential`: an initial convolution, an optional LSTM, then for
-    each ratio a transposed (upsampling) convolution followed by `n_residual_layers` residual
-    blocks, then a final convolution projecting to `channels` and an optional final activation.
-    Args:
-        channels (int): Audio channels.
-        dimension (int): Intermediate representation dimension.
-        n_filters (int): Base width for the model.
-        n_residual_layers (int): nb of residual layers.
-        ratios (Sequence[int]): kernel size and stride ratios.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-        final_activation (str): Final activation function after all convolutions.
-        final_activation_params (dict): Parameters to provide to the final activation function.
-        norm (str): Normalization method.
-        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
-        kernel_size (int): Kernel size for the initial convolution.
-        last_kernel_size (int): Kernel size for the final convolution.
-        residual_kernel_size (int): Kernel size for the residual layers.
-        dilation_base (int): How much to increase the dilation with each layer.
-        causal (bool): Whether to use fully causal convolution.
-        pad_mode (str): Padding mode for the convolutions.
-        true_skip (bool): Whether to use true skip connection or a simple
-            (streamable) convolution as the skip connection in the residual network blocks.
-        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
-        lstm (int): Number of LSTM layers, inserted right after the initial convolution of the decoder.
-        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
-            For the decoder, it corresponds to the N last blocks.
-        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
-            If equal to 1.0, it means that all the trimming is done at the right.
-    """
-    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
-                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
-                 final_activation: tp.Optional[str] = None, final_activation_params: tp.Optional[dict] = None,
-                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
-                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
-                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
-                 disable_norm_outer_blocks: int = 0, trim_right_ratio: float = 1.0):
-        super().__init__()
-        self.dimension = dimension
-        self.channels = channels
-        self.n_filters = n_filters
-        self.ratios = ratios
-        # From here on only `self.ratios` is used.
-        del ratios
-        self.n_residual_layers = n_residual_layers
-        # Product of all strides, i.e. the overall upsampling factor of the transposed convolutions.
-        self.hop_length = np.prod(self.ratios)
-        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
-        self.disable_norm_outer_blocks = disable_norm_outer_blocks
-        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
-            "Number of blocks for which to disable norm is invalid." \
-            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
-        act = getattr(nn, activation)
-        # Channel multiplier, starting at 2 ** (number of stages) and halved after every upsampling stage.
-        mult = int(2 ** len(self.ratios))
-        # First block: its norm is only disabled when every block is disabled (blocks are counted from the end).
-        model: tp.List[nn.Module] = [
-            StreamableConv1d(dimension, mult * n_filters, kernel_size,
-                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
-                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
-        ]
-        if lstm:
-            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
-        # Upsample to raw audio scale
-        for i, ratio in enumerate(self.ratios):
-            # Stage `i` is block `i + 2` from the start, i.e. block `n_blocks - (i + 1)` counted from the end.
-            block_norm = 'none' if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) else norm
-            # Add upsampling layers
-            model += [
-                act(**activation_params),
-                StreamableConvTranspose1d(mult * n_filters, mult * n_filters // 2,
-                                          kernel_size=ratio * 2, stride=ratio,
-                                          norm=block_norm, norm_kwargs=norm_params,
-                                          causal=causal, trim_right_ratio=trim_right_ratio),
-            ]
-            # Add residual layers
-            for j in range(n_residual_layers):
-                model += [
-                    SEANetResnetBlock(mult * n_filters // 2, kernel_sizes=[residual_kernel_size, 1],
-                                      dilations=[dilation_base ** j, 1],
-                                      activation=activation, activation_params=activation_params,
-                                      norm=block_norm, norm_params=norm_params, causal=causal,
-                                      pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
-            mult //= 2
-        # Add final layers
-        model += [
-            act(**activation_params),
-            StreamableConv1d(n_filters, channels, last_kernel_size,
-                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
-                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
-        ]
-        # Add optional final activation to decoder (eg. tanh)
-        if final_activation is not None:
-            final_act = getattr(nn, final_activation)
-            final_activation_params = final_activation_params or {}
-            model += [
-                final_act(**final_activation_params)
-            ]
-        self.model = nn.Sequential(*model)
-    def forward(self, z):
-        y = self.model(z)
-        return y

audiocraft/modules/streaming.py DELETED Viewed

@@ -1,135 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Streaming module API that should be implemented by all Streaming components.
-"""
-from contextlib import contextmanager
-import typing as tp
-from torch import nn
-import torch
-State = tp.Dict[str, torch.Tensor]
-class StreamingModule(nn.Module):
-    """Common API for streaming components.
-    Each streaming component has a streaming state, which is just a dict[str, Tensor].
-    By convention, the first dim of each tensor must be the batch size.
-    Don't use dots in the key names, as this would clash with submodules
-    (like in state_dict).
-    If `self._is_streaming` is True, the component should use and remember
-    the proper state inside `self._streaming_state`.
-    To set a streaming component in streaming state, use
-        with module.streaming():
-            ...
-    This will automatically reset the streaming state when exiting the context manager.
-    This also automatically propagates to all streaming children module.
-    Some module might also implement the `StreamingModule.flush` method, although
-    this one is trickier, as all parents module must be StreamingModule and implement
-    it as well for it to work properly. See `StreamingSequential` after.
-    """
-    def __init__(self) -> None:
-        super().__init__()
-        self._streaming_state: State = {}
-        self._is_streaming = False
-    def _apply_named_streaming(self, fn: tp.Any):
-        for name, module in self.named_modules():
-            if isinstance(module, StreamingModule):
-                fn(name, module)
-    def _set_streaming(self, streaming: bool):
-        def _set_streaming(name, module):
-            module._is_streaming = streaming
-        self._apply_named_streaming(_set_streaming)
-    @contextmanager
-    def streaming(self):
-        """Context manager to enter streaming mode. Reset streaming state on exit.
-        """
-        self._set_streaming(True)
-        try:
-            yield
-        finally:
-            self._set_streaming(False)
-            self.reset_streaming()
-    def reset_streaming(self):
-        """Reset the streaming state.
-        """
-        def _reset(name: str, module: StreamingModule):
-            module._streaming_state.clear()
-        self._apply_named_streaming(_reset)
-    def get_streaming_state(self) -> State:
-        """Return the streaming state, including that of sub-modules.
-        """
-        state: State = {}
-        def _add(name: str, module: StreamingModule):
-            if name:
-                name += "."
-            for key, value in module._streaming_state.items():
-                state[name + key] = value
-        self._apply_named_streaming(_add)
-        return state
-    def set_streaming_state(self, state: State):
-        """Set the streaming state, including that of sub-modules.
-        """
-        state = dict(state)
-        def _set(name: str, module: StreamingModule):
-            if name:
-                name += "."
-            module._streaming_state.clear()
-            for key, value in list(state.items()):
-                # complexity is not ideal here, but probably fine.
-                if key.startswith(name):
-                    local_key = key[len(name):]
-                    if '.' not in local_key:
-                        module._streaming_state[local_key] = value
-                        del state[key]
-        self._apply_named_streaming(_set)
-        assert len(state) == 0, list(state.keys())
-    def flush(self, x: tp.Optional[torch.Tensor] = None):
-        """Flush any remaining outputs that were waiting for completion.
-        Typically, for convolutions, this will add the final padding
-        and process the last buffer.
-        This should take an optional argument `x`, which will be provided
-        if a module before this one in the streaming pipeline has already
-        spitted out a flushed out buffer.
-        """
-        if x is None:
-            return None
-        else:
-            return self(x)
-class StreamingSequential(StreamingModule, nn.Sequential):
-    """A streaming compatible alternative of `nn.Sequential`.
-    """
-    def flush(self, x: tp.Optional[torch.Tensor] = None):
-        for module in self:
-            if isinstance(module, StreamingModule):
-                x = module.flush(x)
-            elif x is not None:
-                x = module(x)
-        return x

audiocraft/modules/transformer.py DELETED Viewed

@@ -1,755 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Transformer model, with streaming support, xformer attention support
-and easy causal attention with a potentially finite receptive field.
-See `StreamingTransformer` for more information.
-Unlike regular PyTorch Transformer, we make the hard choice that batches are first.
-"""
-import typing as tp
-from einops import rearrange
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-from torch.utils.checkpoint import checkpoint as torch_checkpoint
-from xformers import ops
-from .rope import RotaryEmbedding
-from .streaming import StreamingModule
-_efficient_attention_backend: str = 'torch'
-def set_efficient_attention_backend(backend: str = 'torch'):
-    # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
-    global _efficient_attention_backend
-    assert _efficient_attention_backend in ['xformers', 'torch']
-    _efficient_attention_backend = backend
-def _get_attention_time_dimension(memory_efficient: bool) -> int:
-    if _efficient_attention_backend == 'torch' and memory_efficient:
-        return 2
-    else:
-        return 1
-def _is_profiled() -> bool:
-    # Return true if we are currently running with a xformers profiler activated.
-    try:
-        from xformers.profiler import profiler
-    except ImportError:
-        return False
-    return profiler._Profiler._CURRENT_PROFILER is not None
-def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
-    """Create normalization module for transformer encoder layer.
-    Args:
-        norm_type (str): Normalization method.
-        dim (int): Dimension of the normalized layer.
-        **kwargs (dict): Additional parameters for normalization layer.
-    Returns:
-        nn.Module: Normalization module.
-    """
-    if norm_type == 'layer_norm':
-        return nn.LayerNorm(dim, eps=1e-5, **kwargs)
-    else:
-        raise ValueError(f"Unknown norm type: {norm_type}")
-def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
-                         dtype: torch.dtype = torch.float32) -> torch.Tensor:
-    """Create sinusoidal positional embedding, with shape `[B, T, C]`.
-    Args:
-        positions (torch.Tensor): LongTensor of positions.
-        dim (int): Dimension of the embedding.
-        max_period (float): Maximum period of the cosine/sine functions.
-        dtype (torch.dtype or str): dtype to use to generate the embedding.
-    Returns:
-        torch.Tensor: Sinusoidal positional embedding.
-    """
-    # We aim for BTC format
-    assert dim % 2 == 0
-    half_dim = dim // 2
-    positions = positions.to(dtype)
-    adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
-    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
-    phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
-    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
-def expand_repeated_kv(x: torch.Tensor, n_rep: int, memory_efficient: bool) -> torch.Tensor:
-    """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers."""
-    if n_rep == 1:
-        return x
-    if _efficient_attention_backend == 'torch' and memory_efficient:
-        bs, n_kv_heads, slen, head_dim = x.shape
-        return (
-            x[:, :, None, :, :]
-            .expand(bs, n_kv_heads, n_rep, slen, head_dim)
-            .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
-        )
-    else:
-        bs, slen, n_kv_heads, head_dim = x.shape
-        return (
-            x[:, :, :, None, :]
-            .expand(bs, slen, n_kv_heads, n_rep, head_dim)
-            .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
-        )
-class LayerScale(nn.Module):
-    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
-    This rescales diagonally the residual outputs close to 0, with a learnt scale.
-    Args:
-        channels (int): Number of channels.
-        init (float): Initial scale.
-        channel_last (bool): If True, expect `[*, C]` shaped tensors, otherwise, `[*, C, T]`.
-        device (torch.device or str, optional): Device on which to initialize the module.
-        dtype (torch.dtype, optional): dtype to use to initialize the module.
-    """
-    def __init__(self, channels: int, init: float = 1e-4, channel_last: bool = True,
-                 device=None, dtype=None):
-        super().__init__()
-        self.channel_last = channel_last
-        self.scale = nn.Parameter(
-            torch.full((channels,), init,
-                       requires_grad=True, device=device, dtype=dtype))
-    def forward(self, x: torch.Tensor):
-        if self.channel_last:
-            return self.scale * x
-        else:
-            return self.scale[:, None] * x
-class StreamingMultiheadAttention(StreamingModule):
-    """Similar to `nn.MultiheadAttention` but with support for streaming, causal evaluation.
-    Args:
-        embed_dim (int): Dimension to project to.
-        num_heads (int): Number of heads.
-        dropout (float): Dropout level.
-        bias (bool): Use bias in projections.
-        causal (bool): Causal mask applied automatically.
-        past_context (int, optional): Receptive field for the causal mask, infinite if None.
-        custom (bool): Use custom MHA implementation, for testing / benchmarking.
-        memory_efficient (bool): Use xformers based memory efficient attention.
-        attention_as_float32 (bool): Perform the attention as float32
-            (especially important with memory_efficient as autocast won't do this automatically).
-        rope (`RotaryEmbedding`, optional): Rope embedding to use.
-        cross_attention: Should be true when used as a cross attention.
-            All keys and values must be available at once, streaming is only for the queries.
-            Cannot be used with `causal` or `rope` (as it wouldn't make sense to
-            interpret the time steps in the keys relative to those in the queries).
-        safe_streaming (bool): Bug fix, will go away with xformers update.
-        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product.
-        kv_repeat (int): If > 1, will repeat keys and values multiple times (need to divide num_heads).
-            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
-        device (torch.device, optional): Device on which to initialize.
-        dtype (torch.dtype, optional): dtype to use.
-    """
-    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True,
-                 causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
-                 memory_efficient: bool = False, attention_as_float32: bool = False,
-                 rope: tp.Optional[RotaryEmbedding] = None, cross_attention: bool = False,
-                 safe_streaming: bool = True, qk_layer_norm: bool = False, kv_repeat: int = 1,
-                 device=None, dtype=None):
-        """Store the configuration and create the projection (or wrapped `nn.MultiheadAttention`) parameters."""
-        super().__init__()
-        factory_kwargs = {'device': device, 'dtype': dtype}
-        # A finite past context is only accepted together with causal attention.
-        if past_context is not None:
-            assert causal
-        self.embed_dim = embed_dim
-        self.causal = causal
-        self.past_context = past_context
-        self.memory_efficient = memory_efficient
-        self.attention_as_float32 = attention_as_float32
-        self.rope = rope
-        self.cross_attention = cross_attention
-        self.safe_streaming = safe_streaming
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.kv_repeat = kv_repeat
-        if cross_attention:
-            assert not causal, "Causal cannot work with cross attention."
-            assert rope is None, "Rope cannot work with cross attention."
-        if memory_efficient:
-            _verify_xformers_memory_efficient_compat()
-        # NOTE(review): `_is_custom` is defined outside this view; presumably it forces the
-        # custom implementation when memory efficient attention is requested - confirm.
-        self.custom = _is_custom(custom, memory_efficient)
-        if self.custom:
-            out_dim = embed_dim
-            assert num_heads % kv_repeat == 0
-            assert not cross_attention or kv_repeat == 1
-            # With kv_repeat > 1, fewer key/value heads are projected than there are attention heads.
-            num_kv = num_heads // kv_repeat
-            kv_dim = (embed_dim // num_heads) * num_kv
-            # Single input projection: embed_dim outputs plus two blocks of kv_dim outputs
-            # (presumably queries, then keys and values - confirm against `forward`).
-            out_dim += 2 * kv_dim
-            in_proj = nn.Linear(embed_dim, out_dim, bias=bias, **factory_kwargs)
-            # We try to follow the default PyTorch MHA convention, to easily compare results.
-            self.in_proj_weight = in_proj.weight
-            self.in_proj_bias = in_proj.bias
-            if bias:
-                self.in_proj_bias.data.zero_()  # Following Pytorch convention
-            self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
-            if bias:
-                self.out_proj.bias.data.zero_()
-        else:
-            # The builtin MHA path supports neither query/key layer norm nor repeated key/value heads.
-            assert not qk_layer_norm
-            assert kv_repeat == 1
-            self.mha = nn.MultiheadAttention(
-                embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True,
-                **factory_kwargs)
-        self.qk_layer_norm = qk_layer_norm
-        if qk_layer_norm:
-            assert self.custom
-            assert kv_repeat == 1
-            ln_dim = embed_dim
-            self.q_layer_norm = nn.LayerNorm(ln_dim)
-            self.k_layer_norm = nn.LayerNorm(ln_dim)
-    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        if not self.custom:
-            # Support compat with regular MHA
-            keys = [n for n, _ in self.mha.named_parameters()]
-            for key in keys:
-                if prefix + key in state_dict:
-                    state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
-        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
-    def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
-        # Return a causal mask, accounting for potentially stored past keys/values
-        # We actually return a bias for the attention score, as this has the same
-        # convention both in the builtin MHA in Pytorch, and Xformers functions.
-        time_dim = _get_attention_time_dimension(self.memory_efficient)
-        if self.memory_efficient:
-            from xformers.ops import LowerTriangularMask
-            if current_steps == 1:
-                # If we only have one step, then we do not need a mask.
-                return None
-            elif 'past_keys' in self._streaming_state:
-                raise RuntimeError("Not supported at the moment")
-            else:
-                # Then we can safely use a lower triangular mask
-                return LowerTriangularMask()
-        if self._streaming_state:
-            past_keys = self._streaming_state['past_keys']
-            past_steps = past_keys.shape[time_dim]
-        else:
-            past_steps = 0
-        queries_pos = torch.arange(
-            past_steps, current_steps + past_steps, device=device).view(-1, 1)
-        keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
-        delta = queries_pos - keys_pos
-        valid = delta >= 0
-        if self.past_context is not None:
-            valid &= (delta <= self.past_context)
-        return torch.where(
-            valid,
-            torch.zeros([], device=device, dtype=dtype),
-            torch.full([], float('-inf'), device=device, dtype=dtype))
-    def _complete_kv(self, k, v):
-        time_dim = _get_attention_time_dimension(self.memory_efficient)
-        if self.cross_attention:
-            # With cross attention we assume all keys and values
-            # are already available, and streaming is with respect
-            # to the queries only.
-            return k, v
-        # Complete the key/value pair using the streaming state.
-        if self._streaming_state:
-            pk = self._streaming_state['past_keys']
-            nk = torch.cat([pk, k], dim=time_dim)
-            if v is k:
-                nv = nk
-            else:
-                pv = self._streaming_state['past_values']
-                nv = torch.cat([pv, v], dim=time_dim)
-        else:
-            nk = k
-            nv = v
-        assert nk.shape[time_dim] == nv.shape[time_dim]
-        offset = 0
-        if self.past_context is not None:
-            offset = max(0, nk.shape[time_dim] - self.past_context)
-        if self._is_streaming:
-            self._streaming_state['past_keys'] = nk[:, offset:]
-            if v is not k:
-                self._streaming_state['past_values'] = nv[:, offset:]
-            if 'offset' in self._streaming_state:
-                self._streaming_state['offset'] += offset
-            else:
-                self._streaming_state['offset'] = torch.tensor(0)
-        return nk, nv
-    def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
-        time_dim = _get_attention_time_dimension(self.memory_efficient)
-        # Apply rope embeddings to query and key tensors.
-        assert self.rope is not None
-        if 'past_keys' in self._streaming_state:
-            past_keys_offset = self._streaming_state['past_keys'].shape[1]
-        else:
-            past_keys_offset = 0
-        if 'offset' in self._streaming_state:
-            past_context_offset = int(self._streaming_state['offset'].item())
-        else:
-            past_context_offset = 0
-        streaming_offset = past_context_offset + past_keys_offset
-        return self.rope.rotate_qk(query, key, start=streaming_offset, time_dim=time_dim)
-    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
-                key_padding_mask=None, need_weights=False, attn_mask=None,
-                average_attn_weights=True, is_causal=False):
-        assert not is_causal, ("New param added in torch 2.0.1 not supported, "
-                               "use the causal args in the constructor.")
-        time_dim = _get_attention_time_dimension(self.memory_efficient)
-        if time_dim == 2:
-            layout = "b h t d"
-        else:
-            layout = "b t h d"
-        dtype = query.dtype
-        if self._is_streaming:
-            assert self.causal or self.cross_attention, \
-                "Streaming only available for causal or cross attention"
-        custom_attn_mask = attn_mask is not None
-        if self.causal:
-            assert attn_mask is None
-            # At the moment we specialize only for the self-attention case.
-            assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
-            assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
-            attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
-        if self.custom:
-            # custom implementation
-            assert need_weights is False
-            assert key_padding_mask is None
-            if self.cross_attention:
-                # Different queries, keys, values, we have to split the weights manually
-                # before applying the linear.
-                dim = self.in_proj_weight.shape[0] // 3
-                if self.in_proj_bias is None:
-                    bias_q, bias_k, bias_v = None, None, None
-                else:
-                    bias_q = self.in_proj_bias[:dim]
-                    bias_k = self.in_proj_bias[dim: 2 * dim]
-                    bias_v = self.in_proj_bias[2 * dim:]
-                q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
-                # todo: when streaming, we could actually save k, v and check the shape actually match.
-                k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
-                v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
-                if self.qk_layer_norm is True:
-                    q = self.q_layer_norm(q)
-                    k = self.k_layer_norm(k)
-                q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
-            else:
-                if not _is_profiled():
-                    # profiling breaks that property somehow.
-                    assert query is key, "specialized implementation"
-                    assert value is key, "specialized implementation"
-                projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
-                if self.kv_repeat == 1:
-                    if time_dim == 2:
-                        bound_layout = "b h p t d"
-                    else:
-                        bound_layout = "b t p h d"
-                    packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
-                    q, k, v = ops.unbind(packed, dim=2)
-                else:
-                    embed_dim = self.embed_dim
-                    per_head_dim = (embed_dim // self.num_heads)
-                    kv_heads = self.num_heads // self.kv_repeat
-                    q = projected[:, :, :embed_dim]
-                    start = embed_dim
-                    end = start + per_head_dim * kv_heads
-                    k = projected[:, :, start: end]
-                    v = projected[:, :, end:]
-                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
-                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
-                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
-                if self.qk_layer_norm is True:
-                    assert self.kv_repeat == 1
-                    q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
-                    q = self.q_layer_norm(q)
-                    k = self.k_layer_norm(k)
-                    q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
-                if self.rope:
-                    q, k = self._apply_rope(q, k)
-                k, v = self._complete_kv(k, v)
-                if self.kv_repeat > 1:
-                    k = expand_repeated_kv(k, self.kv_repeat, self.memory_efficient)
-                    v = expand_repeated_kv(v, self.kv_repeat, self.memory_efficient)
-            if self.attention_as_float32:
-                q, k, v = [x.float() for x in [q, k, v]]
-            if self.memory_efficient:
-                if custom_attn_mask:
-                    # When using a custom attn mask:
-                    # Move to query's device, repeat for each sample, remove align8 padding
-                    seq_len = query.shape[1]
-                    attn_mask = attn_mask.to(q.dtype)
-                    attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
-                    attn_mask = attn_mask[..., :seq_len, :seq_len]
-                p = self.dropout if self.training else 0
-                if _efficient_attention_backend == 'torch':
-                    x = torch.nn.functional.scaled_dot_product_attention(
-                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
-                else:
-                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
-            else:
-                # We include the dot product as float32, for consistency
-                # with the other implementations that include that step
-                # as part of the attention. Note that when using `autocast`,
-                # the einsums would be done as bfloat16, but the softmax
-                # would be done as float32, so `attention_as_float32` will
-                # extend a bit the range of operations done in float32,
-                # although this should make no difference.
-                q = q / q.shape[-1] ** 0.5
-                key_layout = layout.replace('t', 'k')
-                query_layout = layout
-                if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
-                    with torch.autocast(device_type=q.device.type, dtype=torch.float32):
-                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
-                else:
-                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
-                if attn_mask is not None:
-                    pre_w = pre_w + attn_mask
-                w = torch.softmax(pre_w, dim=-1)
-                w = F.dropout(w, self.dropout, training=self.training).to(v)
-                # Key and value have the same format.
-                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
-            x = x.to(dtype)
-            x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
-            x = self.out_proj(x)
-        else:
-            key, value = self._complete_kv(key, value)
-            if self.attention_as_float32:
-                query, key, value = [x.float() for x in [query, key, value]]
-            x, _ = self.mha(
-                query, key, value, key_padding_mask,
-                need_weights, attn_mask, average_attn_weights)
-            x = x.to(dtype)
-        return x, None
-class StreamingTransformerLayer(nn.TransformerEncoderLayer):
-    """TransformerLayer with Streaming / Causal support.
-    This also integrates cross_attention, when passing `cross_attention=True`,
-    rather than having two separate classes like in PyTorch.
-    Args:
-        d_model (int): Dimension of the data.
-        num_heads (int): Number of heads.
-        dim_feedforward (int): Intermediate dimension of FF module.
-        dropout (float): Dropout both for MHA and FF.
-        bias_ff (bool): Use bias for FF.
-        bias_attn (bool): Use bias for MHA.
-        causal (bool): Causal mask applied automatically.
-        past_context (int, optional): Receptive field for the causal mask, infinite if None.
-        custom (bool): Use custom MHA implementation, for testing / benchmarking.
-        memory_efficient (bool): Use xformers based memory efficient attention.
-        attention_as_float32 (bool): Perform the attention as float32
-            (especially important with memory_efficient as autocast won't do this automatically).
-        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
-        qk_layer_norm_cross (bool): Same for the cross attention.
-        cross_attention (bool): If True, expect to get secondary input for cross-attention.
-            Cross attention will use the default MHA, as it typically won't require
-            special treatment.
-        layer_scale (float, optional): If not None, LayerScale will be used with
-            the given value as initial scale.
-        rope (`RotaryEmbedding`, optional): Rope embedding to use.
-        attention_dropout (float, optional): If not None, separate the value of the dimension dropout
-            in FFN and of the attention dropout.
-        kv_repeat (int): If > 1, will repeat keys and values multiple times (need to divide num_heads).
-            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
-        device (torch.device, optional): Device on which to initialize.
-        dtype (torch.dtype, optional): dtype to use.
-        **kwargs: See `nn.TransformerEncoderLayer`.
-    """
-    def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
-                 bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
-                 past_context: tp.Optional[int] = None, custom: bool = False,
-                 memory_efficient: bool = False, attention_as_float32: bool = False,
-                 qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False,
-                 cross_attention: bool = False, layer_scale: tp.Optional[float] = None,
-                 rope: tp.Optional[RotaryEmbedding] = None, attention_dropout: tp.Optional[float] = None,
-                 kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs):
-        super().__init__(d_model, num_heads, dim_feedforward, dropout,
-                         device=device, dtype=dtype, batch_first=True, **kwargs)
-        factory_kwargs = {'device': device, 'dtype': dtype}
-        # Redefine self_attn to our streaming multi-head attention
-        attn_kwargs: tp.Dict[str, tp.Any] = {
-            'embed_dim': d_model,
-            'num_heads': num_heads,
-            'dropout': dropout if attention_dropout is None else attention_dropout,
-            'bias': bias_attn,
-            'custom': custom,
-            'memory_efficient': memory_efficient,
-            'attention_as_float32': attention_as_float32,
-        }
-        self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
-            causal=causal, past_context=past_context, rope=rope, qk_layer_norm=qk_layer_norm,
-            kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs)  # type: ignore
-        # Redefine feedforward layers to expose bias parameter
-        self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
-        self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
-        self.layer_scale_1: nn.Module
-        self.layer_scale_2: nn.Module
-        if layer_scale is None:
-            self.layer_scale_1 = nn.Identity()
-            self.layer_scale_2 = nn.Identity()
-        else:
-            self.layer_scale_1 = LayerScale(d_model, layer_scale, **factory_kwargs)
-            self.layer_scale_2 = LayerScale(d_model, layer_scale, **factory_kwargs)
-        self.cross_attention: tp.Optional[nn.Module] = None
-        if cross_attention:
-            self.cross_attention = StreamingMultiheadAttention(
-                cross_attention=True, qk_layer_norm=qk_layer_norm_cross,
-                **attn_kwargs, **factory_kwargs)
-            # Norm and dropout
-            self.dropout_cross = nn.Dropout(dropout)
-            # eps value matching that used in PyTorch reference implementation.
-            self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
-            self.layer_scale_cross: nn.Module
-            if layer_scale is None:
-                self.layer_scale_cross = nn.Identity()
-            else:
-                self.layer_scale_cross = LayerScale(d_model, layer_scale, **factory_kwargs)
-        self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
-        self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
-    def _cross_attention_block(self, src: torch.Tensor,
-                               cross_attention_src: torch.Tensor) -> torch.Tensor:
-        assert self.cross_attention is not None
-        # queries are from src, keys and values from cross_attention_src.
-        x = self.cross_attention(
-            src, cross_attention_src, cross_attention_src, need_weights=False)[0]
-        return self.dropout_cross(x)  # type: ignore
-    def forward(self, src: torch.Tensor, src_mask: tp.Optional[torch.Tensor] = None,  # type: ignore
-                src_key_padding_mask: tp.Optional[torch.Tensor] = None,
-                cross_attention_src: tp.Optional[torch.Tensor] = None):
-        if self.cross_attention is None:
-            assert cross_attention_src is None
-        else:
-            assert cross_attention_src is not None
-        x = src
-        if self.norm_first:
-            x = x + self.layer_scale_1(
-                self._sa_block(self.norm1(x), src_mask, src_key_padding_mask))
-            if cross_attention_src is not None:
-                x = x + self.layer_scale_cross(
-                    self._cross_attention_block(
-                        self.norm_cross(x), cross_attention_src))
-            x = x + self.layer_scale_2(self._ff_block(self.norm2(x)))
-        else:
-            x = self.norm1(x + self.layer_scale_1(
-                self._sa_block(x, src_mask, src_key_padding_mask)))
-            if cross_attention_src is not None:
-                x = self.norm_cross(
-                    x + self.layer_scale_cross(
-                        self._cross_attention_block(src, cross_attention_src)))
-            x = self.norm2(x + self.layer_scale_2(self._ff_block(x)))
-        return x
-class StreamingTransformer(StreamingModule):
-    """Transformer with Streaming / Causal support.
-    Args:
-        d_model (int): Dimension of the data.
-        num_heads (int): Number of heads.
-        dim_feedforward (int): Intermediate dimension of FF module.
-        dropout (float): Dropout both for MHA and FF.
-        bias_ff (bool): Use bias for FF.
-        bias_attn (bool): Use bias for MHA.
-        causal (bool): Causal mask applied automatically.
-        past_context (int, optional): Receptive field for the causal mask, infinite if None.
-        custom (bool): Use custom MHA implementation, for testing / benchmarking.
-        memory_efficient (bool): Use xformers based memory efficient attention.
-        attention_as_float32 (bool): Perform the attention as float32
-            (especially important with memory_efficient as autocast won't do this automatically).
-        cross_attention (bool): If True, expect to get secondary input for cross-attention.
-        layer_scale (float, optional): If not None, LayerScale will be used
-            with the given value as initial scale.
-        positional_embedding (str): Positional embedding strategy (sin, rope, or sin_rope).
-        max_period (float): Maximum period of the time embedding.
-        positional_scale (float): Scale of positional embedding, set to 0 to deactivate.
-        xpos (bool): Apply xpos exponential decay to positional embedding (rope only).
-        lr (float, optional): learning rate override through the `make_optim_group` API.
-        weight_decay (float, optional): Weight_decay override through the `make_optim_group` API.
-        layer_class (subclass of `StreamingTransformerLayer`): class to use
-            to initialize the layers, allowing further customization outside of AudioCraft.
-        checkpointing (str): Checkpointing strategy to reduce memory usage.
-            No checkpointing if set to 'none'. Per layer checkpointing using PyTorch
-            if set to 'torch' (entire layer checkpointed, i.e. linears are evaluated twice,
-            minimal memory usage, but maximal runtime). Finally, `xformers_default` provides
-            a policy for opting-out some operations of the checkpointing like
-            linear layers and attention, providing a middle ground between speed and memory.
-        device (torch.device, optional): Device on which to initialize.
-        dtype (torch.dtype, optional): dtype to use.
-        **kwargs: See `nn.TransformerEncoderLayer`.
-    """
-    def __init__(self, d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048,
-                 dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True,
-                 causal: bool = False, past_context: tp.Optional[int] = None,
-                 custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False,
-                 cross_attention: bool = False, layer_scale: tp.Optional[float] = None,
-                 positional_embedding: str = 'sin', max_period: float = 10_000, positional_scale: float = 1.,
-                 xpos: bool = False, lr: tp.Optional[float] = None, weight_decay: tp.Optional[float] = None,
-                 layer_class: tp.Type[StreamingTransformerLayer] = StreamingTransformerLayer,
-                 checkpointing: str = 'none', device=None, dtype=None, **kwargs):
-        super().__init__()
-        assert d_model % num_heads == 0
-        self.positional_embedding = positional_embedding
-        self.max_period = max_period
-        self.positional_scale = positional_scale
-        self.weight_decay = weight_decay
-        self.lr = lr
-        assert positional_embedding in ['sin', 'rope', 'sin_rope']
-        self.rope: tp.Optional[RotaryEmbedding] = None
-        if self.positional_embedding in ['rope', 'sin_rope']:
-            assert _is_custom(custom, memory_efficient)
-            self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
-                                        xpos=xpos, scale=positional_scale, device=device)
-        self.checkpointing = checkpointing
-        assert checkpointing in ['none', 'torch', 'xformers_default', 'xformers_mm']
-        if self.checkpointing.startswith('xformers'):
-            _verify_xformers_internal_compat()
-        self.layers = nn.ModuleList()
-        for idx in range(num_layers):
-            self.layers.append(
-                layer_class(
-                    d_model=d_model, num_heads=num_heads, dim_feedforward=dim_feedforward,
-                    dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
-                    causal=causal, past_context=past_context, custom=custom,
-                    memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
-                    cross_attention=cross_attention, layer_scale=layer_scale, rope=self.rope,
-                    device=device, dtype=dtype, **kwargs))
-        if self.checkpointing != 'none':
-            for layer in self.layers:
-                # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
-                # backward hook inside of FSDP...
-                layer._magma_checkpointed = True  # type: ignore
-    def _apply_layer(self, layer, *args, **kwargs):
-        method = self.checkpointing
-        if method == 'none':
-            return layer(*args, **kwargs)
-        elif method == 'torch':
-            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
-        elif method.startswith('xformers'):
-            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
-            if method == 'xformers_default':
-                # those operations will be saved, and not recomputed.
-                # According to Francisco we can get smarter policies but this is a good start.
-                allow_list = [
-                    "xformers.efficient_attention_forward_cutlass.default",
-                    "xformers_flash.flash_fwd.default",
-                    "aten.addmm.default",
-                    "aten.mm.default",
-                ]
-            elif method == 'xformers_mm':
-                # those operations will be saved, and not recomputed.
-                # According to Francisco we can get smarter policies but this is a good start.
-                allow_list = [
-                    "aten.addmm.default",
-                    "aten.mm.default",
-                ]
-            else:
-                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
-            policy_fn = _get_default_policy(allow_list)
-            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
-        else:
-            raise ValueError(f"Checkpointing method {method} is unknown.")
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        B, T, C = x.shape
-        if 'offsets' in self._streaming_state:
-            offsets = self._streaming_state['offsets']
-        else:
-            offsets = torch.zeros(B, dtype=torch.long, device=x.device)
-        if self.positional_embedding in ['sin', 'sin_rope']:
-            positions = torch.arange(T, device=x.device).view(1, -1, 1)
-            positions = positions + offsets.view(-1, 1, 1)
-            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
-            x = x + self.positional_scale * pos_emb
-        for layer in self.layers:
-            x = self._apply_layer(layer, x, *args, **kwargs)
-        if self._is_streaming:
-            self._streaming_state['offsets'] = offsets + T
-        return x
-    def make_optim_group(self):
-        group = {"params": list(self.parameters())}
-        if self.lr is not None:
-            group["lr"] = self.lr
-        if self.weight_decay is not None:
-            group["weight_decay"] = self.weight_decay
-        return group
-# special attention related function
-def _verify_xformers_memory_efficient_compat():
-    try:
-        from xformers.ops import memory_efficient_attention, LowerTriangularMask  # noqa
-    except ImportError:
-        raise ImportError(
-            "xformers is not installed. Please install it and try again.\n"
-            "To install on AWS and Azure, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='8.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n"
-            "To install on FAIR Cluster, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='6.0;7.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n")
-def _verify_xformers_internal_compat():
-    try:
-        from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy  # noqa
-    except ImportError:
-        raise ImportError(
-            "Francisco's fairinternal xformers is not installed. Please install it and try again.\n"
-            "To install on AWS and Azure, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='8.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n"
-            "To install on FAIR Cluster, run \n"
-            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='6.0;7.0'\\\n"
-            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n")
-def _is_custom(custom: bool, memory_efficient: bool):
-    return custom or memory_efficient

audiocraft/modules/unet_transformer.py DELETED Viewed

@@ -1,67 +0,0 @@
-import torch
-import typing as tp
-from .transformer import StreamingTransformer, create_sin_embedding
-class UnetTransformer(StreamingTransformer):
-    """U-net Transformer for processing sequences with optional skip connections.
-    This transformer architecture incorporates U-net style skip connections
-    between layers, which can be optionally enabled. It inherits from a
-    StreamingTransformer.
-    Args:
-        d_model (int): Dimension of the model, typically the number of expected features in the input.
-        num_layers (int): Total number of layers in the transformer.
-        skip_connections (bool, optional): Flag to determine whether skip connections should be used.
-                                           Defaults to False.
-        layer_dropout_p (float, Optional): if given, defines the Bernoulli probability of dropping a skip connection (in training).
-        **kwargs: Additional keyword arguments inherited from `StreamingTransformer`.
-    """
-    def __init__(self, d_model: int, num_layers: int, skip_connections: bool = False,
-                 layer_dropout_p: tp.Optional[float] = None, **kwargs):
-        super().__init__(d_model=d_model,
-                         num_layers=num_layers,
-                         **kwargs)
-        self.skip_connect = skip_connections
-        if self.skip_connect:
-            self.skip_projections = torch.nn.ModuleList([torch.nn.Linear(d_model * 2, d_model)
-                                                        for _ in range(num_layers // 2)])
-        self.num_layers = num_layers
-        self.layer_drop_p = max(min(layer_dropout_p, 1.), 0.) if layer_dropout_p is not None else 0.0
-    def forward(self, x: torch.Tensor, *args, **kwargs):
-        B, T, C = x.shape
-        if 'offsets' in self._streaming_state:
-            offsets = self._streaming_state['offsets']
-        else:
-            offsets = torch.zeros(B, dtype=torch.long, device=x.device)
-        if self.positional_embedding in ['sin', 'sin_rope']:
-            positions = torch.arange(T, device=x.device).view(1, -1, 1)
-            positions = positions + offsets.view(-1, 1, 1)
-            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
-            x = x + self.positional_scale * pos_emb
-        skip_connections: tp.List[torch.Tensor] = []
-        for i, layer in enumerate(self.layers):
-            if self.skip_connect and i >= self.num_layers // 2:
-                # in the second half of the layers, add residual connection
-                # and linearly project the concatenated features back to d_model
-                x = torch.cat([x, skip_connections.pop()], dim=-1)
-                x = self.skip_projections[i % len(self.skip_projections)](x)
-            x = self._apply_layer(layer, x, *args, **kwargs)
-            if self.skip_connect and i < self.num_layers // 2:
-                if self.training and torch.rand(1,) < self.layer_drop_p:  # drop skip
-                    skip_connections.append(torch.zeros_like(x))
-                else:
-                    skip_connections.append(x)
-        if self._is_streaming:
-            self._streaming_state['offsets'] = offsets + T
-        return x

audiocraft/py.typed DELETED Viewed

File without changes

audiocraft/quantization/__init__.py DELETED Viewed

@@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# flake8: noqa
-from .vq import ResidualVectorQuantizer
-from .base import BaseQuantizer, DummyQuantizer, QuantizedResult

audiocraft/quantization/base.py DELETED Viewed

@@ -1,107 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Base class for all quantizers.
-"""
-from dataclasses import dataclass, field
-import typing as tp
-import torch
-from torch import nn
-@dataclass
-class QuantizedResult:
-    x: torch.Tensor
-    codes: torch.Tensor
-    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
-    penalty: tp.Optional[torch.Tensor] = None
-    metrics: dict = field(default_factory=dict)
-class BaseQuantizer(nn.Module):
-    """Base class for quantizers.
-    """
-    def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
-        """
-        Given input tensor x, returns first the quantized (or approximately quantized)
-        representation along with quantized codes, bandwidth, and any penalty term for the loss.
-        Finally, this returns a dict of metrics to update logging etc.
-        Frame rate must be passed so that the bandwidth is properly computed.
-        """
-        raise NotImplementedError()
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Encode a given input tensor with the specified sample rate at the given bandwidth.
-        """
-        raise NotImplementedError()
-    def decode(self, codes: torch.Tensor) -> torch.Tensor:
-        """Decode the given codes to the quantized representation.
-        """
-        raise NotImplementedError()
-    @property
-    def total_codebooks(self):
-        """Total number of codebooks.
-        """
-        raise NotImplementedError()
-    @property
-    def num_codebooks(self):
-        """Number of active codebooks.
-        """
-        raise NotImplementedError()
-    def set_num_codebooks(self, n: int):
-        """Set the number of active codebooks.
-        """
-        raise NotImplementedError()
-class DummyQuantizer(BaseQuantizer):
-    """Fake quantizer that actually does not perform any quantization.
-    """
-    def __init__(self):
-        super().__init__()
-    def forward(self, x: torch.Tensor, frame_rate: int):
-        q = x.unsqueeze(1)
-        return QuantizedResult(x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x))
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Encode a given input tensor with the specified sample rate at the given bandwidth.
-        In the case of the DummyQuantizer, the codes are actually identical
-        to the input and resulting quantized representation as no quantization is done.
-        """
-        return x.unsqueeze(1)
-    def decode(self, codes: torch.Tensor) -> torch.Tensor:
-        """Decode the given codes to the quantized representation.
-        In the case of the DummyQuantizer, the codes are actually identical
-        to the input and resulting quantized representation as no quantization is done.
-        """
-        return codes.squeeze(1)
-    @property
-    def total_codebooks(self):
-        """Total number of codebooks.
-        """
-        return 1
-    @property
-    def num_codebooks(self):
-        """Number of active codebooks.
-        """
-        return self.total_codebooks
-    def set_num_codebooks(self, n: int):
-        """Set the number of active codebooks.
-        """
-        raise AttributeError("Cannot override the number of codebooks for the dummy quantizer")

audiocraft/quantization/core_vq.py DELETED Viewed

@@ -1,405 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-from einops import rearrange, repeat
-import flashy
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-def exists(val: tp.Optional[tp.Any]) -> bool:
-    """Return True when `val` is anything other than None."""
-    return not (val is None)
-def default(val: tp.Any, d: tp.Any) -> tp.Any:
-    """Return `val`, or the fallback `d` when `val` is None."""
-    if val is None:
-        return d
-    return val
-def l2norm(t):
-    """Scale `t` to unit L2 norm along its last dimension."""
-    return F.normalize(t, dim=-1, p=2)
-def ema_inplace(moving_avg, new, decay: float):
-    """Update `moving_avg` in place to `decay * moving_avg + (1 - decay) * new`."""
-    running = moving_avg.data
-    running.mul_(decay)
-    running.add_(new, alpha=(1 - decay))
-def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
-    """Additively smooth the counts in `x` and normalize them by the smoothed total."""
-    smoothed = x + epsilon
-    total = x.sum() + n_categories * epsilon
-    return smoothed / total
-def uniform_init(*shape: int):
-    """Allocate a tensor of the given shape and fill it with Kaiming-uniform values."""
-    # kaiming_uniform_ fills its argument in place and returns that same tensor.
-    return nn.init.kaiming_uniform_(torch.empty(shape))
-def sample_vectors(samples, num: int):
-    """Pick `num` rows of `samples` at random.
-    Rows are drawn without replacement when there are at least `num` of them,
-    and with replacement otherwise.
-    """
-    device = samples.device
-    available = samples.shape[0]
-    if available < num:
-        picks = torch.randint(0, available, (num,), device=device)
-    else:
-        picks = torch.randperm(available, device=device)[:num]
-    return samples[picks]
-def kmeans(samples, num_clusters: int, num_iters: int = 10):
-    """Run k-means on a 2D `(n, d)` tensor of samples.
-    Args:
-        samples: Tensor of shape `(n, d)` (the rearrange patterns below require two axes).
-        num_clusters (int): Number of centroids to fit.
-        num_iters (int): Number of assignment/update rounds.
-    Returns:
-        (means, bins): the `(num_clusters, d)` centroids and the per-cluster sample counts
-        from the last round. NOTE(review): `bins` is only assigned inside the loop, so
-        `num_iters=0` would fail at the return statement.
-    """
-    dim, dtype = samples.shape[-1], samples.dtype
-    # Initial centroids are randomly drawn samples.
-    means = sample_vectors(samples, num_clusters)
-    for _ in range(num_iters):
-        # Broadcast to (n, c, d): every sample against every centroid.
-        diffs = rearrange(samples, "n d -> n () d") - rearrange(
-            means, "c d -> () c d"
-        )
-        # Negated squared distance, so the max below selects the nearest centroid.
-        dists = -(diffs ** 2).sum(dim=-1)
-        buckets = dists.max(dim=-1).indices
-        bins = torch.bincount(buckets, minlength=num_clusters)
-        zero_mask = bins == 0
-        # Avoid dividing by zero for clusters that received no sample.
-        bins_min_clamped = bins.masked_fill(zero_mask, 1)
-        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
-        # Sum the samples of each cluster, then divide by the count to get the mean.
-        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
-        new_means = new_means / bins_min_clamped[..., None]
-        # Empty clusters keep their previous centroid.
-        means = torch.where(zero_mask[..., None], means, new_means)
-    return means, bins
-def orthogonal_loss_fn(t):
-    """Mean squared deviation of the codes' cosine-similarity matrix from the identity."""
-    # eq (2) from https://arxiv.org/abs/2112.00384
-    num_codes = t.shape[0]
-    unit_codes = l2norm(t)
-    target = torch.eye(num_codes, device=t.device)
-    similarity = einsum("i d, j d -> i j", unit_codes, unit_codes)
-    deviation = (similarity - target) ** 2
-    return deviation.sum() / (num_codes ** 2)
-class EuclideanCodebook(nn.Module):
-    """Codebook with Euclidean distance.
-    Args:
-        dim (int): Dimension.
-        codebook_size (int): Codebook size.
-        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
-            If set to true, run the k-means algorithm on the first training batch and use
-            the learned centroids as initialization.
-        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
-        decay (float): Decay for exponential moving average over the codebooks.
-        epsilon (float): Epsilon value for numerical stability.
-        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
-            that have an exponential moving average cluster size less than the specified threshold with
-            randomly selected vector from the current batch.
-    """
-    def __init__(
-        self,
-        dim: int,
-        codebook_size: int,
-        kmeans_init: bool = False,
-        kmeans_iters: int = 10,
-        decay: float = 0.8,
-        epsilon: float = 1e-5,
-        threshold_ema_dead_code: int = 2,
-    ):
-        super().__init__()
-        self.decay = decay
-        # With k-means init the codebook starts as zeros and is filled by `init_embed_`.
-        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
-        embed = init_fn(codebook_size, dim)
-        self.codebook_size = codebook_size
-        self.kmeans_iters = kmeans_iters
-        self.epsilon = epsilon
-        self.threshold_ema_dead_code = threshold_ema_dead_code
-        # `inited` is a 1-element flag buffer: already true unless k-means init is pending.
-        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
-        self.register_buffer("cluster_size", torch.zeros(codebook_size))
-        self.register_buffer("embed", embed)
-        self.register_buffer("embed_avg", embed.clone())
-    @torch.jit.ignore
-    def init_embed_(self, data):
-        """Initialize the codebook with k-means on `data`; does nothing once `inited` is set."""
-        if self.inited:
-            return
-        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
-        self.embed.data.copy_(embed)
-        self.embed_avg.data.copy_(embed.clone())
-        self.cluster_size.data.copy_(cluster_size)
-        self.inited.data.copy_(torch.Tensor([True]))
-        # Make sure all buffers across workers are in sync after initialization
-        flashy.distrib.broadcast_tensors(self.buffers())
-    def replace_(self, samples, mask):
-        """Overwrite the codebook rows selected by `mask` with vectors sampled from `samples`."""
-        modified_codebook = torch.where(
-            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
-        )
-        self.embed.data.copy_(modified_codebook)
-    def expire_codes_(self, batch_samples):
-        """Replace codes whose EMA cluster size is below `threshold_ema_dead_code`.
-        A threshold of 0 disables expiration.
-        """
-        if self.threshold_ema_dead_code == 0:
-            return
-        expired_codes = self.cluster_size < self.threshold_ema_dead_code
-        if not torch.any(expired_codes):
-            return
-        batch_samples = rearrange(batch_samples, "... d -> (...) d")
-        self.replace_(batch_samples, mask=expired_codes)
-        flashy.distrib.broadcast_tensors(self.buffers())
-    def preprocess(self, x):
-        """Flatten all leading dimensions of `x`, keeping the last one."""
-        x = rearrange(x, "... d -> (...) d")
-        return x
-    def quantize(self, x):
-        """Return, for each row of `x`, the index of the nearest codebook entry."""
-        embed = self.embed.t()
-        # Negated squared Euclidean distance, expanded as -(|x|^2 - 2 x.e + |e|^2),
-        # so the max below selects the closest code.
-        dist = -(
-            x.pow(2).sum(1, keepdim=True)
-            - 2 * x @ embed
-            + embed.pow(2).sum(0, keepdim=True)
-        )
-        embed_ind = dist.max(dim=-1).indices
-        return embed_ind
-    def postprocess_emb(self, embed_ind, shape):
-        """Reshape flat indices to `shape` without its last dimension."""
-        return embed_ind.view(*shape[:-1])
-    def dequantize(self, embed_ind):
-        """Look up the codebook vectors for the given indices."""
-        quantize = F.embedding(embed_ind, self.embed)
-        return quantize
-    def encode(self, x):
-        """Map `x` to codebook indices shaped like `x` without its last dimension."""
-        shape = x.shape
-        # pre-process
-        x = self.preprocess(x)
-        # quantize
-        embed_ind = self.quantize(x)
-        # post-process
-        embed_ind = self.postprocess_emb(embed_ind, shape)
-        return embed_ind
-    def decode(self, embed_ind):
-        """Map codebook indices back to their vectors."""
-        quantize = self.dequantize(embed_ind)
-        return quantize
-    def forward(self, x):
-        """Quantize `x`; in training mode, also update the codebook by moving average.
-        Returns:
-            (quantize, embed_ind): the selected codebook vectors and their indices.
-        """
-        shape, dtype = x.shape, x.dtype
-        x = self.preprocess(x)
-        self.init_embed_(x)
-        embed_ind = self.quantize(x)
-        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
-        embed_ind = self.postprocess_emb(embed_ind, shape)
-        quantize = self.dequantize(embed_ind)
-        if self.training:
-            # We do the expiry of code at that point as buffers are in sync
-            # and all the workers will take the same decision.
-            self.expire_codes_(x)
-            # EMA of how many vectors were assigned to each code...
-            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
-            # ...and EMA of the sum of the vectors assigned to each code.
-            embed_sum = x.t() @ embed_onehot
-            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
-            cluster_size = (
-                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
-                * self.cluster_size.sum()
-            )
-            # New code = averaged sum divided by the smoothed count.
-            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
-            self.embed.data.copy_(embed_normalized)
-        return quantize, embed_ind
-class VectorQuantization(nn.Module):
-    """Vector quantization implementation.
-    Currently supports only euclidean distance.
-    Args:
-        dim (int): Dimension
-        codebook_size (int): Codebook size
-        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
-        decay (float): Decay for exponential moving average over the codebooks.
-        epsilon (float): Epsilon value for numerical stability.
-        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
-        kmeans_iters (int): Number of iterations used for kmeans initialization.
-        channels_last (bool): Channels are the last dimension in the input tensors.
-        commitment_weight (float): Weight for commitment loss.
-        orthogonal_reg_weight (float): Orthogonal regularization weights.
-        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
-        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
-            for orthogonal regularization.
-        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
-            that have an exponential moving average cluster size less than the specified threshold with
-            randomly selected vector from the current batch.
-    """
-    def __init__(
-        self,
-        dim: int,
-        codebook_size: int,
-        codebook_dim: tp.Optional[int] = None,
-        decay: float = 0.8,
-        epsilon: float = 1e-5,
-        kmeans_init: bool = False,
-        kmeans_iters: int = 10,
-        threshold_ema_dead_code: int = 2,
-        channels_last: bool = False,
-        commitment_weight: float = 1.,
-        orthogonal_reg_weight: float = 0.0,
-        orthogonal_reg_active_codes_only: bool = False,
-        orthogonal_reg_max_codes: tp.Optional[int] = None,
-    ):
-        super().__init__()
-        _codebook_dim: int = default(codebook_dim, dim)
-        # Linear projections are only needed when the codebook dimension differs from `dim`.
-        requires_projection = _codebook_dim != dim
-        self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
-        self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
-        self.epsilon = epsilon
-        self.commitment_weight = commitment_weight
-        self.orthogonal_reg_weight = orthogonal_reg_weight
-        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
-        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
-        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
-                                           kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
-                                           decay=decay, epsilon=epsilon,
-                                           threshold_ema_dead_code=threshold_ema_dead_code)
-        self.codebook_size = codebook_size
-        self.channels_last = channels_last
-    @property
-    def codebook(self):
-        """The underlying codebook's `embed` buffer."""
-        return self._codebook.embed
-    @property
-    def inited(self):
-        """The underlying codebook's `inited` flag buffer."""
-        return self._codebook.inited
-    def _preprocess(self, x):
-        """Move channels last (`b d n -> b n d`) unless the input already is channels-last."""
-        if not self.channels_last:
-            x = rearrange(x, "b d n -> b n d")
-        return x
-    def _postprocess(self, quantize):
-        """Undo `_preprocess` (`b n d -> b d n`) unless working channels-last."""
-        if not self.channels_last:
-            quantize = rearrange(quantize, "b n d -> b d n")
-        return quantize
-    def encode(self, x):
-        """Project `x` into the codebook space and return the codebook indices."""
-        x = self._preprocess(x)
-        x = self.project_in(x)
-        embed_in = self._codebook.encode(x)
-        return embed_in
-    def decode(self, embed_ind):
-        """Turn codebook indices back into vectors in the input layout."""
-        quantize = self._codebook.decode(embed_ind)
-        quantize = self.project_out(quantize)
-        quantize = self._postprocess(quantize)
-        return quantize
-    def forward(self, x):
-        """Quantize `x` and compute the training loss.
-        Returns:
-            (quantize, embed_ind, loss): quantized tensor in the input layout, the codebook
-            indices, and a 1-element loss tensor (zero outside of training).
-        """
-        device = x.device
-        x = self._preprocess(x)
-        x = self.project_in(x)
-        quantize, embed_ind = self._codebook(x)
-        if self.training:
-            # Straight-through estimator: the value is `quantize`, the gradient flows to `x`.
-            quantize = x + (quantize - x).detach()
-        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
-        if self.training:
-            if self.commitment_weight > 0:
-                commit_loss = F.mse_loss(quantize.detach(), x)
-                loss = loss + commit_loss * self.commitment_weight
-            if self.orthogonal_reg_weight > 0:
-                codebook = self.codebook
-                if self.orthogonal_reg_active_codes_only:
-                    # only calculate orthogonal loss for the activated codes for this batch
-                    unique_code_ids = torch.unique(embed_ind)
-                    codebook = codebook[unique_code_ids]
-                num_codes = codebook.shape[0]
-                if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
-                    # Randomly subsample the codes to cap the cost of the regularization.
-                    rand_ids = torch.randperm(num_codes, device=device)[:self.orthogonal_reg_max_codes]
-                    codebook = codebook[rand_ids]
-                orthogonal_reg_loss = orthogonal_loss_fn(codebook)
-                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
-        quantize = self.project_out(quantize)
-        quantize = self._postprocess(quantize)
-        return quantize, embed_ind, loss
-class ResidualVectorQuantization(nn.Module):
-    """Residual vector quantization implementation.
-    A stack of `VectorQuantization` layers in which every layer quantizes the residual
-    left by the layers before it.
-    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
-    """
-    def __init__(self, *, num_quantizers, **kwargs):
-        super().__init__()
-        quantizers = [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
-        self.layers = nn.ModuleList(quantizers)
-    def forward(self, x, n_q: tp.Optional[int] = None):
-        """Quantize `x` with the first `n_q` layers (all layers when `n_q` is None or 0).
-        Returns:
-            (quantized_out, out_indices, out_losses): the summed quantized output, and the
-            per-layer indices and losses stacked along a new leading dimension.
-        """
-        n_q = n_q or len(self.layers)
-        residual = x
-        quantized_out = 0.0
-        all_indices = []
-        all_losses = []
-        for layer in self.layers[:n_q]:
-            quantized, indices, loss = layer(residual)
-            quantized = quantized.detach()
-            residual = residual - quantized
-            quantized_out = quantized_out + quantized
-            all_indices.append(indices)
-            all_losses.append(loss)
-        if self.training:
-            # Solving subtle bug with STE and RVQ: https://github.com/facebookresearch/encodec/issues/25
-            quantized_out = x + (quantized_out - x).detach()
-        out_losses = torch.stack(all_losses)
-        out_indices = torch.stack(all_indices)
-        return quantized_out, out_indices, out_losses
-    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
-        """Return the stacked indices chosen by the first `n_q` layers for `x`."""
-        n_q = n_q or len(self.layers)
-        residual = x
-        all_indices = []
-        for layer in self.layers[:n_q]:
-            indices = layer.encode(residual)
-            all_indices.append(indices)
-            # The next layer only sees what this one failed to represent.
-            residual = residual - layer.decode(indices)
-        return torch.stack(all_indices)
-    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
-        """Sum the decoded output of each layer for its slice of `q_indices`."""
-        quantized_out = torch.tensor(0.0, device=q_indices.device)
-        for i, indices in enumerate(q_indices):
-            quantized_out = quantized_out + self.layers[i].decode(indices)
-        return quantized_out

audiocraft/quantization/vq.py DELETED Viewed

@@ -1,116 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import math
-import typing as tp
-import torch
-from .base import BaseQuantizer, QuantizedResult
-from .core_vq import ResidualVectorQuantization
-class ResidualVectorQuantizer(BaseQuantizer):
-    """Residual Vector Quantizer.
-    Args:
-        dimension (int): Dimension of the codebooks.
-        n_q (int): Number of residual vector quantizers used.
-        q_dropout (bool): Random quantizer drop out at train time.
-        bins (int): Codebook size.
-        decay (float): Decay for exponential moving average over the codebooks.
-        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
-        kmeans_iters (int): Number of iterations used for kmeans initialization.
-        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
-            that have an exponential moving average cluster size less than the specified threshold with
-            randomly selected vector from the current batch.
-        orthogonal_reg_weight (float): Orthogonal regularization weights.
-        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
-        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
-            for orthogonal regularization.
-    """
-    def __init__(
-        self,
-        dimension: int = 256,
-        n_q: int = 8,
-        q_dropout: bool = False,
-        bins: int = 1024,
-        decay: float = 0.99,
-        kmeans_init: bool = True,
-        kmeans_iters: int = 10,
-        threshold_ema_dead_code: int = 2,
-        orthogonal_reg_weight: float = 0.0,
-        orthogonal_reg_active_codes_only: bool = False,
-        orthogonal_reg_max_codes: tp.Optional[int] = None,
-    ):
-        super().__init__()
-        # `max_n_q` is the number of layers built; `n_q` is how many are currently used.
-        self.max_n_q = n_q
-        self.n_q = n_q
-        self.q_dropout = q_dropout
-        self.dimension = dimension
-        self.bins = bins
-        self.decay = decay
-        self.kmeans_init = kmeans_init
-        self.kmeans_iters = kmeans_iters
-        self.threshold_ema_dead_code = threshold_ema_dead_code
-        self.orthogonal_reg_weight = orthogonal_reg_weight
-        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
-        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
-        self.vq = ResidualVectorQuantization(
-            dim=self.dimension,
-            codebook_size=self.bins,
-            num_quantizers=self.n_q,
-            decay=self.decay,
-            kmeans_init=self.kmeans_init,
-            kmeans_iters=self.kmeans_iters,
-            threshold_ema_dead_code=self.threshold_ema_dead_code,
-            orthogonal_reg_weight=self.orthogonal_reg_weight,
-            orthogonal_reg_active_codes_only=self.orthogonal_reg_active_codes_only,
-            orthogonal_reg_max_codes=self.orthogonal_reg_max_codes,
-            channels_last=False
-        )
-    def forward(self, x: torch.Tensor, frame_rate: int):
-        """Quantize `x` and return a `QuantizedResult` with codes, bandwidth and penalty.
-        With `q_dropout` enabled in training mode, the number of quantizers used is drawn
-        uniformly at random between 1 and `n_q` (inclusive) on each call.
-        """
-        n_q = self.n_q
-        if self.training and self.q_dropout:
-            n_q = int(torch.randint(1, self.n_q + 1, (1,)).item())
-        # log2(bins) bits per code, `frame_rate` codes per second, / 1000 for kilo-units.
-        bw_per_q = math.log2(self.bins) * frame_rate / 1000
-        quantized, codes, commit_loss = self.vq(x, n_q=n_q)
-        codes = codes.transpose(0, 1)
-        # codes is [B, K, T], with T frames, K nb of codebooks.
-        bw = torch.tensor(n_q * bw_per_q).to(x)
-        return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Encode a given input tensor with the specified frame rate at the given bandwidth.
-        The RVQ encode method sets the appropriate number of quantizer to use
-        and returns indices for each quantizer.
-        """
-        n_q = self.n_q
-        codes = self.vq.encode(x, n_q=n_q)
-        codes = codes.transpose(0, 1)
-        # codes is [B, K, T], with T frames, K nb of codebooks.
-        return codes
-    def decode(self, codes: torch.Tensor) -> torch.Tensor:
-        """Decode the given codes to the quantized representation.
-        """
-        # codes is [B, K, T], with T frames, K nb of codebooks, vq.decode expects [K, B, T].
-        codes = codes.transpose(0, 1)
-        quantized = self.vq.decode(codes)
-        return quantized
-    @property
-    def total_codebooks(self):
-        """Number of codebooks the quantizer was built with."""
-        return self.max_n_q
-    @property
-    def num_codebooks(self):
-        """Number of codebooks currently in use."""
-        return self.n_q
-    def set_num_codebooks(self, n: int):
-        """Set the number of codebooks in use; `n` must be between 1 and `total_codebooks`."""
-        assert n > 0 and n <= self.max_n_q
-        self.n_q = n

audiocraft/utils/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.

audiocraft/utils/autocast.py DELETED Viewed

@@ -1,40 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import torch
-class TorchAutocast:
-    """TorchAutocast utility class.
-    Allows you to enable and disable autocast. This is specially useful
-    when dealing with different architectures and clusters with different
-    levels of support.
-    When disabled, entering and exiting the context does nothing.
-    Args:
-        enabled (bool): Whether to enable torch.autocast or not.
-        args: Additional args for torch.autocast.
-        kwargs: Additional kwargs for torch.autocast
-    """
-    def __init__(self, enabled: bool, *args, **kwargs):
-        self.autocast = torch.autocast(*args, **kwargs) if enabled else None
-    def __enter__(self):
-        """Enter the wrapped autocast context, if any. Always returns None.
-        Raises:
-            RuntimeError: If torch.autocast fails to start; the message reports the dtype
-                and device, and the original error is attached as the cause.
-        """
-        if self.autocast is None:
-            return
-        try:
-            self.autocast.__enter__()
-        except RuntimeError as err:
-            device = self.autocast.device
-            dtype = self.autocast.fast_dtype
-            # Chain explicitly so the traceback presents the torch error as the direct
-            # cause rather than as an unrelated failure inside this handler.
-            raise RuntimeError(
-                f"There was an error autocasting with dtype={dtype} device={device}\n"
-                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
-            ) from err
-    def __exit__(self, *args, **kwargs):
-        """Exit the wrapped autocast context, if any."""
-        if self.autocast is None:
-            return
-        self.autocast.__exit__(*args, **kwargs)

audiocraft/utils/cache.py DELETED Viewed

@@ -1,324 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from concurrent.futures import ThreadPoolExecutor
-from collections import deque
-from functools import partial
-from hashlib import sha1
-import logging
-from pathlib import Path
-import sys
-import typing as tp
-import zipfile
-import flashy
-import torch
-logger = logging.getLogger(__name__)
-def get_full_embed(full_embed: torch.Tensor, x: tp.Any, idx: int, device: tp.Union[str, torch.device]) -> torch.Tensor:
-    """Utility function for the EmbeddingCache, returning the full embedding without any chunking.
-    This method can be used in case there is no need in extracting a chunk of the full embedding
-    read from the cache.
-    Args:
-        full_embed (torch.Tensor): The full embedding.
-        x (any): Batch object from which the full embedding is derived. Unused here.
-        idx (int): Index of object to consider in the batch object. Unused here.
-        device (str or torch.device): Device to which the embedding is moved.
-    Returns:
-        full_embed (torch.Tensor): The full embedding, on `device`.
-    """
-    return full_embed.to(device)
-class EmbeddingCache:
-    """Cache around embeddings computation for faster execution.
-    The EmbeddingCache is storing pre-computed embeddings on disk and provides a simple API
-    to retrieve the pre-computed embeddings on full inputs and extract only a given chunk
-    using a user-provided function. When the cache is warm (all embeddings are pre-computed),
-    the EmbeddingCache allows for faster training as it removes the need of computing the embeddings.
-    Additionally, it provides in-memory cache around the loaded embeddings to limit IO footprint
-    and synchronization points in the forward calls.
-    Args:
-        cache_path (Path): Path to folder where all pre-computed embeddings are saved on disk.
-        device (str or torch.device): Device on which the embedding is returned.
-        compute_embed_fn (callable[[Path, any, int], torch.Tensor]): Function to compute
-            the embedding from a given object and path. This user provided function can compute the
-            embedding from the provided object or using the provided path as entry point. The last parameter
-            specify the index corresponding to the current embedding in the object that can represent batch metadata.
-        extract_embed_fn (callable[[torch.Tensor, any, int], torch.Tensor], optional): Function to extract
-            the desired embedding chunk from the full embedding loaded from the cache. The last parameter
-            specify the index corresponding to the current embedding in the object that can represent batch metadata.
-            If not specified, will return the full embedding unmodified.
-    """
-    def __init__(self, cache_path: tp.Union[str, Path], device: tp.Union[str, torch.device],
-                 compute_embed_fn: tp.Callable[[Path, tp.Any, int], torch.Tensor],
-                 extract_embed_fn: tp.Optional[tp.Callable[[torch.Tensor, tp.Any, int], torch.Tensor]] = None):
-        self.cache_path = Path(cache_path)
-        self.device = device
-        self._compute_embed_fn = compute_embed_fn
-        self._extract_embed_fn: tp.Callable[[torch.Tensor, tp.Any, int], torch.Tensor]
-        if extract_embed_fn is not None:
-            self._extract_embed_fn = extract_embed_fn
-        else:
-            self._extract_embed_fn = partial(get_full_embed, device=device)
-        if self.cache_path is not None:
-            self.cache_path.mkdir(exist_ok=True, parents=True)
-            logger.info(f"Cache instantiated at: {self.cache_path}")
-            # Thread pool used by `populate_embed_cache` to read cache files concurrently.
-            self.pool = ThreadPoolExecutor(8)
-            self.pool.__enter__()
-        # Extracted chunks for the batch being processed, keyed by cache file path.
-        self._current_batch_cache: dict = {}
-        # Full embeddings already read from disk, keyed by cache file path.
-        self._memory_cache: dict = {}
-    def _get_cache_path(self, path: tp.Union[Path, str]):
-        """Get cache path for the given file path."""
-        sig = sha1(str(path).encode()).hexdigest()
-        return self.cache_path / sig
-    @staticmethod
-    def _get_full_embed_from_cache(cache: Path):
-        """Loads full pre-computed embedding from the cache.
-        Returns None (after logging the error) when the file cannot be loaded.
-        """
-        try:
-            embed = torch.load(cache, 'cpu')
-        except Exception as exc:
-            logger.error("Error loading %s: %r", cache, exc)
-            embed = None
-        return embed
-    def get_embed_from_cache(self, paths: tp.List[Path], x: tp.Any) -> torch.Tensor:
-        """Get embedding from cache, computing and storing it to cache if not already cached.
-        The EmbeddingCache first tries to load the embedding from the in-memory cache
-        containing the pre-computed chunks populated through `populate_embed_cache`.
-        If not found, the full embedding is computed and stored on disk to be later accessed
-        to populate the in-memory cache, and the desired embedding chunk is extracted and returned.
-        Writing to disk is best effort: a failed write is logged and the freshly computed
-        embedding is still used for the returned batch.
-        Args:
-            paths (list[Path or str]): List of paths from where the embeddings can be loaded.
-            x (any): Object from which the embedding is extracted.
-        Returns:
-            torch.Tensor: The per-path embeddings stacked along a new first dimension.
-        """
-        embeds = []
-        for idx, path in enumerate(paths):
-            cache = self._get_cache_path(path)
-            if cache in self._current_batch_cache:
-                embed = self._current_batch_cache[cache]
-            else:
-                full_embed = self._compute_embed_fn(path, x, idx)
-                try:
-                    with flashy.utils.write_and_rename(cache, pid=True) as f:
-                        torch.save(full_embed.cpu(), f)
-                except Exception as exc:
-                    logger.error('Error saving embed %s (%s): %r', cache, full_embed.shape, exc)
-                else:
-                    logger.info('New embed cache saved: %s (%s)', cache, full_embed.shape)
-                # Extract outside of the try/else: the embedding was computed whether or not
-                # it could be saved. Leaving this in the `else` branch left `embed` unbound
-                # (or stale from the previous path) whenever the write failed.
-                embed = self._extract_embed_fn(full_embed, x, idx)
-            embeds.append(embed)
-        embed = torch.stack(embeds, dim=0)
-        return embed
-    def populate_embed_cache(self, paths: tp.List[Path], x: tp.Any) -> None:
-        """Populate in-memory caches for embeddings reading from the embeddings stored on disk.
-        The in-memory caches consist in a cache for the full embedding and another cache for the
-        final embedding chunk. Such caches are used to limit the IO access when computing the actual embeddings
-        and reduce the IO footprint and synchronization points during forward passes.
-        Args:
-            paths (list[Path]): List of paths from where the embeddings can be loaded.
-            x (any): Object from which the embedding is extracted.
-        """
-        self._current_batch_cache.clear()
-        if self.cache_path is not None:
-            # First pass: start a background read for every cache file that exists on disk
-            # and is not yet in memory. `None` marks entries with nothing to read.
-            futures: list = []
-            for path in paths:
-                assert path is not None, "Path is required for computation from cache"
-                cache = self._get_cache_path(path)
-                if cache in self._memory_cache or not cache.exists():
-                    futures.append(None)
-                else:
-                    futures.append(self.pool.submit(EmbeddingCache._get_full_embed_from_cache, cache))
-            # Second pass: collect the results and extract the chunk for each available embedding.
-            for idx, (path, future) in enumerate(zip(paths, futures)):
-                assert path is not None
-                cache = self._get_cache_path(path)
-                full_embed = None
-                if future is None:
-                    if cache in self._memory_cache:
-                        full_embed = self._memory_cache[cache]
-                else:
-                    full_embed = future.result()
-                    if full_embed is not None:
-                        self._memory_cache[cache] = full_embed
-                        full_embed = full_embed.to(self.device)
-                if full_embed is not None:
-                    embed = self._extract_embed_fn(full_embed, x, idx)
-                    self._current_batch_cache[cache] = embed
-class CachedBatchWriter:
-    """Write pre computed caches for mini batches. This can
-    make loading a lot more efficient depending on your filesystem.
-    Args:
-        cache_folder (Path): folder in which the cached minibatches
-            will be stored.
-    Inside cache folder, the structure is the following:
-    `epoch_number / update_number.zip`
-    And the zip file contains one entry per batch item.
-    It is possible to use the cache with a batch size smaller than
-    created with but obviously not larger. Make sure to call the
-    `start_epoch(epoch)` method for indicating changes of epochs.
-    See the grid `audiocraft/grids/musicgen/musicgen_warmup_cache.py`
-    for an example of how to warmup the cache.
-    """
-    def __init__(self, cache_folder: Path):
-        self.cache_folder = cache_folder
-        # Set by `start_epoch`; `_zip_path` asserts that it is not None.
-        self._current_epoch: tp.Optional[int] = None
-        # Index of the next mini batch to write within the current epoch.
-        self._current_index = 0
-    def start_epoch(self, epoch: int):
-        """Call at the beginning of each epoch.
-        Resets the batch index and creates the folder of the epoch if needed.
-        """
-        self._current_epoch = epoch
-        self._current_index = 0
-        self._zip_path.parent.mkdir(exist_ok=True, parents=True)
-    @staticmethod
-    def _get_zip_path(cache_folder: Path, epoch: int, index: int):
-        """Return `cache_folder / <epoch, 5 digits> / <index, 6 digits>.zip`."""
-        return cache_folder / f"{epoch:05d}" / f"{index:06d}.zip"
-    @property
-    def _zip_path(self):
-        """Path of the zip file for the current epoch and batch index."""
-        assert self._current_epoch is not None
-        return CachedBatchWriter._get_zip_path(self.cache_folder, self._current_epoch, self._current_index)
-    def save(self, *content):
-        """Save one mini batch. This function is distributed-aware
-        and will automatically merge all the items from the different
-        workers.
-        """
-        all_contents = []
-        # Each rank takes a turn as the broadcast source.
-        # NOTE(review): presumably every worker ends up with the content of all ranks -- confirm
-        # against flashy.distrib.broadcast_object.
-        for rank in range(flashy.distrib.world_size()):
-            their_content = flashy.distrib.broadcast_object(content, src=rank)
-            all_contents.append(their_content)
-        if flashy.distrib.is_rank_zero():
-            idx = 0
-            with flashy.utils.write_and_rename(self._zip_path) as tmp:
-                with zipfile.ZipFile(tmp, 'w') as zf:
-                    for content in all_contents:
-                        # zip(*content) regroups the saved arguments item by item, so each
-                        # zip entry (named by a running index) holds one batch item.
-                        for vals in zip(*content):
-                            with zf.open(f'{idx}', 'w') as f:  # type: ignore
-                                torch.save(vals, f)
-                            idx += 1
-        flashy.distrib.barrier()
-        self._current_index += 1
-class CachedBatchLoader:
-    """Loader for cached mini-batches dumped with `CachedBatchWriter`.
-    Args:
-        cache_folder (Path): folder in which the cached minibatches are stored.
-        batch_size (int): batch size (per GPU) expected.
-        num_workers (int): number of workers to use for loading.
-        min_length (int): minimum expected length for each epoch. If some
-            mini-batches are missing, and error is raised.
-    This is iterable just like a regular DataLoader.
-    """
-    def __init__(self, cache_folder: Path, batch_size: int,
-                 num_workers: int = 10, min_length: int = 1):
-        self.cache_folder = cache_folder
-        self.batch_size = batch_size
-        self.num_workers = num_workers
-        self.min_length = min_length
-        self._current_epoch: tp.Optional[int] = None
-        self.sampler = None  # for compatibility with the regular DataLoader
-    def __len__(self):
-        path = CachedBatchWriter._get_zip_path(self.cache_folder, self._current_epoch or 0, 0).parent
-        return len([p for p in path.iterdir() if p.suffix == ".zip"])
-    def start_epoch(self, epoch: int):
-        """Call at the beginning of each epoch.
-        """
-        self._current_epoch = epoch
-    def _zip_path(self, index: int):
-        assert self._current_epoch is not None
-        return CachedBatchWriter._get_zip_path(self.cache_folder, self._current_epoch, index)
-    def _load_one(self, index: int):
-        zip_path = self._zip_path(index)
-        if not zip_path.exists():
-            if index < self.min_length:
-                raise RuntimeError(f"Cache should have at least {self.min_length} batches, but {index} doesn't exist")
-            return None
-        mode = "rb" if sys.version_info >= (3, 9) else "r"
-        try:
-            with zipfile.ZipFile(zip_path, 'r') as zf:
-                rank = flashy.distrib.rank()
-                world_size = flashy.distrib.world_size()
-                root = zipfile.Path(zf)
-                items = list(root.iterdir())
-                total_batch_size = self.batch_size * world_size
-                if len(items) < total_batch_size:
-                    raise RuntimeError(
-                        f"The cache can handle a max batch size of {len(items)}, "
-                        f"but {total_batch_size} is needed.")
-                start = rank * self.batch_size
-                items = items[start: start + self.batch_size]
-                assert len(items) == self.batch_size
-                entries = []
-                entries = [torch.load(item.open(mode), 'cpu') for item in items]  # type: ignore
-                transposed = zip(*entries)
-                out = []
-                for part in transposed:
-                    assert len(part) > 0
-                    if isinstance(part[0], torch.Tensor):
-                        out.append(torch.stack(part))
-                    else:
-                        assert isinstance(part, torch.Tensor)
-                        out.append(part)
-                return out
-        except Exception:
-            logger.error("Error when reading zip path %s", zip_path)
-            raise
-    def __iter__(self):
-        """This will yields tuples, exactly as provided to the
-        `CachedBatchWriter.save` method.
-        """
-        pool = ThreadPoolExecutor(self.num_workers)
-        next_index = 0
-        queue = deque()
-        def _get_next():
-            nonlocal next_index
-            r = queue.popleft().result()
-            if r is None:
-                return None
-            else:
-                queue.append(pool.submit(self._load_one, next_index))
-                next_index += 1
-            return r
-        with pool:
-            # fill the buffer of fetching jobs.
-            for _ in range(2 * self.num_workers):
-                queue.append(pool.submit(self._load_one, next_index))
-                next_index += 1
-            while True:
-                batch = _get_next()
-                if batch is None:
-                    return
-                yield batch

audiocraft/utils/cluster.py DELETED Viewed

@@ -1,75 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Utility functions for SLURM configuration and cluster settings.
-"""
-from enum import Enum
-import os
-import socket
-import typing as tp
-import omegaconf
-class ClusterType(Enum):
-    AWS = "aws"
-    FAIR = "fair"
-    RSC = "rsc"
-    LOCAL_DARWIN = "darwin"
-    DEFAULT = "default"  # used for any other cluster.
-def _guess_cluster_type() -> ClusterType:
-    uname = os.uname()
-    fqdn = socket.getfqdn()
-    if uname.sysname == "Linux" and (uname.release.endswith("-aws") or ".ec2" in fqdn):
-        return ClusterType.AWS
-    if fqdn.endswith(".fair"):
-        return ClusterType.FAIR
-    if fqdn.endswith(".facebook.com"):
-        return ClusterType.RSC
-    if uname.sysname == "Darwin":
-        return ClusterType.LOCAL_DARWIN
-    return ClusterType.DEFAULT
-def get_cluster_type(
-    cluster_type: tp.Optional[ClusterType] = None,
-) -> tp.Optional[ClusterType]:
-    if cluster_type is None:
-        return _guess_cluster_type()
-    return cluster_type
-def get_slurm_parameters(
-    cfg: omegaconf.DictConfig, cluster_type: tp.Optional[ClusterType] = None
-) -> omegaconf.DictConfig:
-    """Update SLURM parameters in configuration based on cluster type.
-    If the cluster type is not specify, it infers it automatically.
-    """
-    from ..environment import AudioCraftEnvironment
-    cluster_type = get_cluster_type(cluster_type)
-    # apply cluster-specific adjustments
-    if cluster_type == ClusterType.AWS:
-        cfg["mem_per_gpu"] = None
-        cfg["constraint"] = None
-        cfg["setup"] = []
-    elif cluster_type == ClusterType.RSC:
-        cfg["mem_per_gpu"] = None
-        cfg["setup"] = []
-        cfg["constraint"] = None
-        cfg["partition"] = "learn"
-    slurm_exclude = AudioCraftEnvironment.get_slurm_exclude()
-    if slurm_exclude is not None:
-        cfg["exclude"] = slurm_exclude
-    return cfg

audiocraft/utils/export.py DELETED Viewed

@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Utility to export a training checkpoint to a lightweight release checkpoint.
-"""
-from pathlib import Path
-import typing as tp
-from omegaconf import OmegaConf
-import torch
-from audiocraft import __version__
-def export_encodec(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
-    """Export only the best state from the given EnCodec checkpoint. This
-    should be used if you trained your own EnCodec model.
-    """
-    pkg = torch.load(checkpoint_path, 'cpu')
-    new_pkg = {
-        'best_state': pkg['best_state']['model'],
-        'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
-        'version': __version__,
-        'exported': True,
-    }
-    Path(out_file).parent.mkdir(exist_ok=True, parents=True)
-    torch.save(new_pkg, out_file)
-    return out_file
-def export_pretrained_compression_model(pretrained_encodec: str, out_file: tp.Union[Path, str]):
-    """Export a compression model (potentially EnCodec) from a pretrained model.
-    This is required for packaging the audio tokenizer along a MusicGen or AudioGen model.
-    Do not include the //pretrained/ prefix. For instance if you trained a model
-    with `facebook/encodec_32khz`, just put that as a name. Same for `dac_44khz`.
-    In that case, this will not actually include a copy of the model, simply the reference
-    to the model used.
-    """
-    if Path(pretrained_encodec).exists():
-        pkg = torch.load(pretrained_encodec)
-        assert 'best_state' in pkg
-        assert 'xp.cfg' in pkg
-        assert 'version' in pkg
-        assert 'exported' in pkg
-    else:
-        pkg = {
-            'pretrained': pretrained_encodec,
-            'exported': True,
-            'version': __version__,
-        }
-    Path(out_file).parent.mkdir(exist_ok=True, parents=True)
-    torch.save(pkg, out_file)
-def export_lm(checkpoint_path: tp.Union[Path, str], out_file: tp.Union[Path, str]):
-    """Export only the best state from the given MusicGen or AudioGen checkpoint.
-    """
-    pkg = torch.load(checkpoint_path, 'cpu')
-    if pkg['fsdp_best_state']:
-        best_state = pkg['fsdp_best_state']['model']
-    else:
-        assert pkg['best_state']
-        best_state = pkg['best_state']['model']
-    new_pkg = {
-        'best_state': best_state,
-        'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
-        'version': __version__,
-        'exported': True,
-    }
-    Path(out_file).parent.mkdir(exist_ok=True, parents=True)
-    torch.save(new_pkg, out_file)
-    return out_file

audiocraft/utils/export_legacy.py DELETED Viewed

@@ -1,56 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Utility to export a training checkpoint to a lightweight release checkpoint.
-"""
-from pathlib import Path
-import typing as tp
-from omegaconf import OmegaConf, DictConfig
-import torch
-def _clean_lm_cfg(cfg: DictConfig):
-    OmegaConf.set_struct(cfg, False)
-    # This used to be set automatically in the LM solver, need a more robust solution
-    # for the future.
-    cfg['transformer_lm']['card'] = 2048
-    cfg['transformer_lm']['n_q'] = 4
-    # Experimental params no longer supported.
-    bad_params = ['spectral_norm_attn_iters', 'spectral_norm_ff_iters',
-                  'residual_balancer_attn', 'residual_balancer_ff', 'layer_drop']
-    for name in bad_params:
-        del cfg['transformer_lm'][name]
-    OmegaConf.set_struct(cfg, True)
-    return cfg
-def export_encodec(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
-    sig = Path(checkpoint_path).parent.name
-    assert len(sig) == 8, "Not a valid Dora signature"
-    pkg = torch.load(checkpoint_path, 'cpu')
-    new_pkg = {
-        'best_state': pkg['ema']['state']['model'],
-        'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
-    }
-    out_file = Path(out_folder) / f'{sig}.th'
-    torch.save(new_pkg, out_file)
-    return out_file
-def export_lm(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
-    sig = Path(checkpoint_path).parent.name
-    assert len(sig) == 8, "Not a valid Dora signature"
-    pkg = torch.load(checkpoint_path, 'cpu')
-    new_pkg = {
-        'best_state': pkg['fsdp_best_state']['model'],
-        'xp.cfg': OmegaConf.to_yaml(_clean_lm_cfg(pkg['xp.cfg']))
-    }
-    out_file = Path(out_folder) / f'{sig}.th'
-    torch.save(new_pkg, out_file)
-    return out_file

audiocraft/utils/extend.py DELETED Viewed

@@ -1,440 +0,0 @@
-from tabnanny import verbose
-import torch
-import math
-from audiocraft.models import MusicGen
-import numpy as np
-from PIL import Image, ImageDraw, ImageFont, ImageColor
-import string
-import tempfile
-import os
-import textwrap
-import requests
-from io import BytesIO
-from huggingface_hub import hf_hub_download
-import librosa
-import gradio as gr
-import re
-from tqdm import tqdm
-INTERRUPTING = False
-def separate_audio_segments(audio, segment_duration=30, overlap=1):
-    sr, audio_data = audio[0], audio[1]
-    segment_samples = sr * segment_duration
-    total_samples = max(min((len(audio_data) // segment_samples), 25), 0)
-    overlap_samples = sr * overlap
-    segments = []
-    start_sample = 0
-    # handle the case where the audio is shorter than the segment duration
-    if total_samples == 0:
-        total_samples = 1
-        segment_samples = len(audio_data)
-        overlap_samples = 0
-    while total_samples >= segment_samples:
-        # Collect the segment
-        # the end sample is the start sample plus the segment samples,
-        # the start sample, after 0, is minus the overlap samples to account for the overlap
-        end_sample = start_sample + segment_samples
-        segment = audio_data[start_sample:end_sample]
-        segments.append((sr, segment))
-        start_sample += segment_samples - overlap_samples
-        total_samples -= segment_samples
-    # Collect the final segment
-    if total_samples > 0:
-        segment = audio_data[-segment_samples:]
-        segments.append((sr, segment))
-    print(f"separate_audio_segments: {len(segments)} segments of length {segment_samples // sr} seconds")
-    return segments
-def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30, prompt_index:int=0, harmony_only:bool= False, excerpt_duration:float=3.5, progress= gr.Progress(track_tqdm=True)):
-    # generate audio segments
-    melody_segments = separate_audio_segments(melody, segment_duration, 0)
-    # Create lists to store the melody tensors for each segment
-    melodys = []
-    output_segments = []
-    last_chunk = []
-    text += ", seed=" + str(seed)
-    prompt_segment = None
-    # prevent hacking
-    duration = min(duration, 720)
-    overlap =  min(overlap, 15)
-    # Calculate the total number of segments
-    total_segments = max(math.ceil(duration / segment_duration),1)
-    #calculate duration loss from segment overlap
-    duration_loss = max(total_segments - 1,0) * math.ceil(overlap / 2)
-    #calc excess duration
-    excess_duration = segment_duration - (total_segments * segment_duration - duration)
-    print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
-    duration += duration_loss
-    pbar = tqdm(total=total_segments*2, desc="Generating segments", leave=False)
-    while excess_duration + duration_loss > segment_duration:
-        total_segments += 1
-        #calculate duration loss from segment overlap
-        duration_loss += math.ceil(overlap / 2)
-        #calc excess duration
-        excess_duration = segment_duration - (total_segments * segment_duration - duration)
-        print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration} Overlap Loss {duration_loss}")
-        if excess_duration + duration_loss > segment_duration:
-            duration += duration_loss
-            duration_loss = 0
-    pbar.update(1)
-    total_segments = min(total_segments, (720 // segment_duration))
-    # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
-    if len(melody_segments) < total_segments:
-        #fix melody_segments
-        for i in range(total_segments - len(melody_segments)):
-            segment = melody_segments[i]
-            melody_segments.append(segment)
-            pbar.update(1)
-        print(f"melody_segments: {len(melody_segments)} fixed")
-    # Iterate over the segments to create list of Melody tensors
-    for segment_idx in range(total_segments):
-        if INTERRUPTING:
-            return [], duration
-        print(f"segment {segment_idx + 1} of {total_segments} \r")
-        if harmony_only:
-            # REMOVE PERCUSION FROM MELODY
-            # Apply HPSS using librosa
-            verse_harmonic, verse_percussive = librosa.effects.hpss(melody_segments[segment_idx][1])
-            # Convert the separated components back to torch.Tensor
-            #harmonic_tensor = torch.from_numpy(verse_harmonic)
-            #percussive_tensor = torch.from_numpy(verse_percussive)
-            sr, verse = melody_segments[segment_idx][0], torch.from_numpy(verse_harmonic).to(MODEL.device).float().t().unsqueeze(0)
-        else:
-            sr, verse = melody_segments[segment_idx][0], torch.from_numpy(melody_segments[segment_idx][1]).to(MODEL.device).float().t().unsqueeze(0)
-        print(f"shape:{verse.shape} dim:{verse.dim()}")
-        #if verse is 2D, add 3rd dimension
-        if verse.dim() == 2:
-           verse = verse[None]
-        verse = verse[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
-        # Reduce the length of verse to sr * excerpt_duration
-        if ("style" in MODEL.name):
-            verse = verse[:, :, :int(sr * excerpt_duration)]
-        # Append the segment to the melodys list
-        melodys.append(verse)
-        pbar.update(1)
-    pbar.close()
-    torch.manual_seed(seed)
-    # If user selects a prompt segment, generate a new prompt segment to use on all segments
-    #default to the first segment for prompt conditioning
-    prompt_verse = melodys[0]
-    if prompt_index > 0:
-        # Get a prompt segment from the selected verse, normally the first verse
-        prompt_verse = melodys[prompt_index if prompt_index <= (total_segments - 1) else (total_segments -1)]
-    # set the prompt segment MODEL generation params
-    MODEL.set_generation_params(
-        use_sampling=True,
-        top_k=MODEL.generation_params["top_k"],
-        top_p=MODEL.generation_params["top_p"],
-        temperature=MODEL.generation_params["temp"],
-        cfg_coef=MODEL.generation_params["cfg_coef"],
-        cfg_coef_beta=MODEL.generation_params["cfg_coef_beta"],
-        duration=segment_duration,
-        two_step_cfg=False,
-        rep_penalty=0.5,
-    )
-    if ("style" in MODEL.name):
-        MODEL.set_style_conditioner_params(
-            eval_q=MODEL.lm.condition_provider.conditioners.self_wav.eval_q, # integer between 1 and 6
-            excerpt_length=excerpt_duration, # the length in seconds that is taken by the model in the provided excerpt, can be between 1.5 and 4.5 seconds but it has to be shortest to the length of the provided conditioning
-        )
-    # Generate a new prompt segment. This will be applied to all segments for consistency
-    print(f"Generating New Prompt Segment: {text} from verse {prompt_index}\r")
-    prompt_segment = MODEL.generate_with_all(
-        descriptions=[text],
-        melody_wavs=prompt_verse,
-        sample_rate=sr,
-        progress=False,
-        prompt=None,
-    )
-    for idx, verse in tqdm(enumerate(melodys), total=len(melodys), desc="Generating melody segments"):
-        if INTERRUPTING:
-            return output_segments, duration
-        print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
-        # Compensate for the length of final segment
-        if ((idx + 1) == len(melodys)) or (duration < segment_duration):
-            mod_duration = max(min(duration, segment_duration),1)
-            print(f'Modify verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss} to mod duration: {mod_duration}')
-            MODEL.set_generation_params(
-                use_sampling=True,
-                top_k=MODEL.generation_params["top_k"],
-                top_p=MODEL.generation_params["top_p"],
-                temperature=MODEL.generation_params["temp"],
-                cfg_coef=MODEL.generation_params["cfg_coef"],
-                cfg_coef_beta=MODEL.generation_params["cfg_coef_beta"],
-                duration=mod_duration,
-                two_step_cfg=False,
-                rep_penalty=0.5,
-            )
-            if ("style" in MODEL.name):
-                MODEL.set_style_conditioner_params(
-                    eval_q=MODEL.lm.condition_provider.conditioners.self_wav.eval_q, # integer between 1 and 6
-                    excerpt_length=min(excerpt_duration, mod_duration), # the length in seconds that is taken by the model in the provided excerpt, can be between 1.5 and 4.5 seconds but it has to be shortest to the length of the provided conditioning
-                )
-            try:
-                # get last chunk
-                verse = verse[:, :, -mod_duration*MODEL.sample_rate:]
-                prompt_segment = prompt_segment[:, :, -mod_duration*MODEL.sample_rate:]
-            except:
-                # get first chunk
-                verse = verse[:, :, :mod_duration*MODEL.sample_rate]
-                prompt_segment = prompt_segment[:, :, :mod_duration*MODEL.sample_rate]
-        print(f"Generating New Melody Segment {idx + 1}: {text}\r")
-        output, tokens = MODEL.generate_with_all(
-            descriptions=[text],
-            melody_wavs=verse,
-            sample_rate=sr,
-            progress=True,
-            prompt=prompt_segment,
-            return_tokens = True
-        )
-        # If user selects a prompt segment, use the prompt segment for all segments
-        # Otherwise, use the previous segment as the prompt
-        if prompt_index < 0:
-            if harmony_only:
-                # REMOVE PERCUSION FROM MELODY
-                # Apply HPSS using librosa
-                verse_harmonic, verse_percussive = librosa.effects.hpss(output.detach().cpu().numpy())
-                # Convert the separated components back to torch.Tensor
-                #harmonic_tensor = torch.from_numpy(verse_harmonic)
-                #percussive_tensor = torch.from_numpy(verse_percussive)
-                verse = torch.from_numpy(verse_harmonic).to(MODEL.device).float()
-                # if verse is 2D, add extra dimension
-                if verse.dim() == 2:
-                   verse = verse[None]
-                output = verse
-            prompt_segment = output
-        # Append the generated output to the list of segments
-        #output_segments.append(output[:, :segment_duration])
-        output_segments.append(output)
-        print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
-        #track duration
-        if duration > segment_duration:
-            duration -= segment_duration
-    return output_segments, excess_duration
-def save_image(image):
-    """
-    Saves a PIL image to a temporary file and returns the file path.
-    Parameters:
-    - image: PIL.Image
-        The PIL image object to be saved.
-    Returns:
-    - str or None: The file path where the image was saved,
-        or None if there was an error saving the image.
-    """
-    temp_dir = tempfile.gettempdir()
-    temp_file = tempfile.NamedTemporaryFile(suffix=".png", dir=temp_dir, delete=False)
-    temp_file.close()
-    file_path = temp_file.name
-    try:
-        image.save(file_path)
-    except Exception as e:
-        print("Unable to save image:", str(e))
-        return None
-    finally:
-        return file_path
-def detect_color_format(color):
-    """
-    Detects if the color is in RGB, RGBA, or hex format,
-    and converts it to an RGBA tuple with integer components.
-    Args:
-        color (str or tuple): The color to detect.
-    Returns:
-        tuple: The color in RGBA format as a tuple of 4 integers.
-    Raises:
-        ValueError: If the input color is not in a recognized format.
-    """
-    # Handle color as a tuple of floats or integers
-    if isinstance(color, tuple):
-        if len(color) == 3 or len(color) == 4:
-            # Ensure all components are numbers
-            if all(isinstance(c, (int, float)) for c in color):
-                r, g, b = color[:3]
-                a = color[3] if len(color) == 4 else 255
-                return (
-                    max(0, min(255, int(round(r)))),
-                    max(0, min(255, int(round(g)))),
-                    max(0, min(255, int(round(b)))),
-                    max(0, min(255, int(round(a * 255)) if a <= 1 else round(a))),
-                )
-        else:
-            raise ValueError(f"Invalid color tuple length: {len(color)}")
-    # Handle hex color codes
-    if isinstance(color, str):
-        color = color.strip()
-        # Try to use PIL's ImageColor
-        try:
-            rgba = ImageColor.getcolor(color, "RGBA")
-            return rgba
-        except ValueError:
-            pass
-        # Handle 'rgba(r, g, b, a)' string format
-        rgba_match = re.match(r'rgba\(\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)\s*\)', color)
-        if rgba_match:
-            r, g, b, a = map(float, rgba_match.groups())
-            return (
-                max(0, min(255, int(round(r)))),
-                max(0, min(255, int(round(g)))),
-                max(0, min(255, int(round(b)))),
-                max(0, min(255, int(round(a * 255)) if a <= 1 else round(a))),
-            )
-        # Handle 'rgb(r, g, b)' string format
-        rgb_match = re.match(r'rgb\(\s*([0-9.]+),\s*([0-9.]+),\s*([0-9.]+)\s*\)', color)
-        if rgb_match:
-            r, g, b = map(float, rgb_match.groups())
-            return (
-                max(0, min(255, int(round(r)))),
-                max(0, min(255, int(round(g)))),
-                max(0, min(255, int(round(b)))),
-                255,
-            )
-    # If none of the above conversions work, raise an error
-    raise ValueError(f"Invalid color format: {color}")
-def hex_to_rgba(hex_color):
-    try:
-        if hex_color.startswith("#"):
-            clean_hex = hex_color.replace('#','')
-            # Use a generator expression to convert pairs of hexadecimal digits to integers and create a tuple
-            rgba = tuple(int(clean_hex[i:i+2], 16) for i in range(0, len(clean_hex),2))
-        else:
-            rgba = tuple(map(int,detect_color_format(hex_color)))
-    except ValueError:
-        # If the hex color is invalid, default to yellow
-        rgba = (255,255,0,255)
-    return rgba
-def load_font(font_name, font_size=16):
-    """
-    Load a font using the provided font name and font size.
-    Parameters:
-        font_name (str): The name of the font to load. Can be a font name recognized by the system, a URL to download the font file,
-            a local file path, or a Hugging Face model hub identifier.
-        font_size (int, optional): The size of the font. Default is 16.
-    Returns:
-        ImageFont.FreeTypeFont: The loaded font object.
-    Notes:
-        This function attempts to load the font using various methods until a suitable font is found. If the provided font_name
-        cannot be loaded, it falls back to a default font.
-        The font_name can be one of the following:
-        - A font name recognized by the system, which can be loaded using ImageFont.truetype.
-        - A URL pointing to the font file, which is downloaded using requests and then loaded using ImageFont.truetype.
-        - A local file path to the font file, which is loaded using ImageFont.truetype.
-        - A Hugging Face model hub identifier, which downloads the font file from the Hugging Face model hub using hf_hub_download
-          and then loads it using ImageFont.truetype.
-    Example:
-        font = load_font("Arial.ttf", font_size=20)
-    """
-    font = None
-    if not "http" in font_name:
-        try:
-            font = ImageFont.truetype(font_name, font_size)
-        except (FileNotFoundError, OSError):
-            print("Font not found. Using Hugging Face download..\n")
-        if font is None:
-            try:
-                font_path = ImageFont.truetype(hf_hub_download(repo_id=os.environ.get('SPACE_ID', ''), filename="assets/" + font_name, repo_type="space"), encoding="UTF-8")
-                font = ImageFont.truetype(font_path, font_size)
-            except (FileNotFoundError, OSError):
-                print("Font not found. Trying to download from local assets folder...\n")
-        if font is None:
-            try:
-                font = ImageFont.truetype("assets/" + font_name, font_size)
-            except (FileNotFoundError, OSError):
-                print("Font not found. Trying to download from URL...\n")
-    if font is None:
-        try:
-            req = requests.get(font_name)
-            font = ImageFont.truetype(BytesIO(req.content), font_size)
-        except (FileNotFoundError, OSError):
-             print(f"Font not found: {font_name} Using default font\n")
-    if font:
-        print(f"Font loaded {font.getname()}")
-    else:
-        font = ImageFont.load_default()
-    return font
-def add_settings_to_image(title: str = "title", description: str = "", width: int = 768, height: int = 512, background_path: str = "", font: str = "arial.ttf", font_color: str = "#ffffff", font_size: int = 28, progress=gr.Progress(track_tqdm=True)):
-    # Create a new RGBA image with the specified dimensions
-    image = Image.new("RGBA", (width, height), (255, 255, 255, 0))
-    # If a background image is specified, open it and paste it onto the image
-    if background_path == "":
-        background = Image.new("RGBA", (width, height), (255, 255, 255, 255))
-    else:
-        background = Image.open(background_path).convert("RGBA")
-    #Convert font color to RGBA tuple
-    font_color = hex_to_rgba(font_color)
-    print(f"Font Color: {font_color}\n")
-    # Calculate the center coordinates for placing the text
-    text_x = width // 2
-    text_y = height // 2
-    # Draw the title text at the center top
-    title_font = load_font(font, font_size)  # Replace with your desired font and size
-    title_text = '\n'.join(textwrap.wrap(title, width // 12))
-    title_x, title_y, title_text_width, title_text_height = title_font.getbbox(title_text)
-    title_x = max(text_x - (title_text_width // 2), title_x, 0)
-    title_y = text_y - (height // 2) + 10  # 10 pixels padding from the top
-    title_draw = ImageDraw.Draw(image)
-    title_draw.multiline_text((title_x, title_y), title, fill=font_color, font=title_font, align="center")
-    # Draw the description text two lines below the title
-    description_font = load_font(font, int(font_size * 2 / 3))  # Replace with your desired font and size
-    description_text = '\n'.join(textwrap.wrap(description, width // 12))
-    description_x, description_y, description_text_width, description_text_height = description_font.getbbox(description_text)
-    description_x = max(text_x - (description_text_width // 2), description_x, 0)
-    description_y = title_y + title_text_height + 20  # 20 pixels spacing between title and description
-    description_draw = ImageDraw.Draw(image)
-    description_draw.multiline_text((description_x, description_y), description_text, fill=font_color, font=description_font, align="center")
-    # Calculate the offset to center the image on the background
-    bg_w, bg_h = background.size
-    offset = ((bg_w - width) // 2, (bg_h - height) // 2)
-    # Paste the image onto the background
-    background.paste(image, offset, mask=image)
-    # Save the image and return the file path
-    return save_image(background)

audiocraft/utils/notebook.py DELETED Viewed

@@ -1,32 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-try:
-    import IPython.display as ipd  # type: ignore
-except ImportError:
-    # Note in a notebook...
-    pass
-import torch
-def display_audio(samples: torch.Tensor, sample_rate: int):
-    """Renders an audio player for the given audio samples.
-    Args:
-        samples (torch.Tensor): a Tensor of decoded audio samples
-            with shapes [B, C, T] or [C, T]
-        sample_rate (int): sample rate audio should be displayed with.
-    """
-    assert samples.dim() == 2 or samples.dim() == 3
-    samples = samples.detach().cpu()
-    if samples.dim() == 2:
-        samples = samples[None, ...]
-    for audio in samples:
-        ipd.display(ipd.Audio(audio, rate=sample_rate))

audiocraft/utils/utils.py DELETED Viewed

@@ -1,328 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from concurrent.futures import ProcessPoolExecutor
-from contextlib import contextmanager
-from functools import wraps, lru_cache
-import hashlib
-import json
-import logging
-from pathlib import Path
-import typing as tp
-import flashy
-import flashy.distrib
-import omegaconf
-import torch
-from torch.nn.utils.rnn import pad_sequence
-logger = logging.getLogger(__name__)
-def model_hash(model: torch.nn.Module) -> str:
-    """Return a model hash. This should allow us to track regressions in model init
-    from the logs of past experiments.
-    """
-    hasher = hashlib.sha1()
-    for p in model.parameters():
-        hasher.update(p.data.cpu().numpy().tobytes())
-    return hasher.hexdigest()
-def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
-    """Convenience function to map an omegaconf configuration to a dictionary.
-    Args:
-        cfg (omegaconf.DictConfig): Original configuration to map to dict.
-    Returns:
-        dict: Config as dictionary object.
-    """
-    dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
-    assert isinstance(dct, dict)
-    return dct
-def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
-    if max_samples >= len(dataset):
-        return dataset
-    generator = torch.Generator().manual_seed(seed)
-    perm = torch.randperm(len(dataset), generator=generator)
-    return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())
-def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
-               num_workers: int, seed: int, **kwargs) -> torch.utils.data.DataLoader:
-    """Convenience function to load dataset into a dataloader with optional subset sampling.
-    Args:
-        dataset: Dataset to load.
-        num_samples (Optional[int]): Number of samples to limit subset size.
-        batch_size (int): Batch size.
-        num_workers (int): Number of workers for data loading.
-        seed (int): Random seed.
-    """
-    if num_samples is not None:
-        dataset = random_subset(dataset, num_samples, seed)
-    dataloader = flashy.distrib.loader(
-        dataset,
-        batch_size=batch_size,
-        num_workers=num_workers,
-        **kwargs
-    )
-    return dataloader
-def get_dataset_from_loader(dataloader):
-    dataset = dataloader.dataset
-    if isinstance(dataset, torch.utils.data.Subset):
-        return dataset.dataset
-    else:
-        return dataset
-def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
-    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.
-    Args:
-        input (torch.Tensor): The input tensor containing probabilities.
-        num_samples (int): Number of samples to draw.
-        replacement (bool): Whether to draw with replacement or not.
-    Keywords args:
-        generator (torch.Generator): A pseudorandom number generator for sampling.
-    Returns:
-        torch.Tensor: Last dimension contains num_samples indices
-            sampled from the multinomial probability distribution
-            located in the last dimension of tensor input.
-    """
-    input_ = input.reshape(-1, input.shape[-1])
-    output_ = torch.multinomial(input_, num_samples=num_samples, replacement=replacement, generator=generator)
-    output = output_.reshape(*list(input.shape[:-1]), -1)
-    return output
-def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
-    """Sample next token from top K values along the last dimension of the input probs tensor.
-    Args:
-        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
-        k (int): The k in “top-k”.
-    Returns:
-        torch.Tensor: Sampled tokens.
-    """
-    top_k_value, _ = torch.topk(probs, k, dim=-1)
-    min_value_top_k = top_k_value[..., [-1]]
-    probs *= (probs >= min_value_top_k).float()
-    probs.div_(probs.sum(dim=-1, keepdim=True))
-    next_token = multinomial(probs, num_samples=1)
-    return next_token
-def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
-    """Sample next token from top P probabilities along the last dimension of the input probs tensor.
-    Args:
-        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
-        p (int): The p in “top-p”.
-    Returns:
-        torch.Tensor: Sampled tokens.
-    """
-    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = probs_sum - probs_sort > p
-    probs_sort *= (~mask).float()
-    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    next_token = multinomial(probs_sort, num_samples=1)
-    next_token = torch.gather(probs_idx, -1, next_token)
-    return next_token
-class DummyPoolExecutor:
-    """Dummy pool executor to use when we actually have only 1 worker.
-    (e.g. instead of ProcessPoolExecutor).
-    """
-    class DummyResult:
-        def __init__(self, func, *args, **kwargs):
-            self.func = func
-            self.args = args
-            self.kwargs = kwargs
-        def result(self):
-            return self.func(*self.args, **self.kwargs)
-    def __init__(self, workers, mp_context=None):
-        pass
-    def submit(self, func, *args, **kwargs):
-        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)
-    def __enter__(self):
-        return self
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        return
-def get_pool_executor(num_workers: int, mp_context=None):
-    return ProcessPoolExecutor(num_workers, mp_context) if num_workers > 1 else DummyPoolExecutor(1)
-def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
-    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
-    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
-    Args:
-        lengths (torch.Tensor): tensor with lengths
-        max_len (int): can set the max length manually. Defaults to None.
-    Returns:
-        torch.Tensor: mask with 0s where there is pad tokens else 1s
-    """
-    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
-    final_length = lengths.max().item() if not max_len else max_len
-    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
-    return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]
-def hash_trick(word: str, vocab_size: int) -> int:
-    """Hash trick to pair each word with an index
-    Args:
-        word (str): word we wish to convert to an index
-        vocab_size (int): size of the vocabulary
-    Returns:
-        int: index of the word in the embedding LUT
-    """
-    hash = int(hashlib.sha256(word.encode("utf-8")).hexdigest(), 16)
-    return hash % vocab_size
-def with_rank_rng(base_seed: int = 1234):
-    """Decorator for a function so that the function will use a Random Number Generator
-    whose state depend on the GPU rank. The original RNG state is restored upon returning.
-    Args:
-        base_seed (int): Random seed.
-    """
-    def _decorator(fun: tp.Callable):
-        @wraps(fun)
-        def _decorated(*args, **kwargs):
-            state = torch.get_rng_state()
-            seed = base_seed ^ flashy.distrib.rank()
-            torch.manual_seed(seed)
-            logger.debug('Rank dependent seed set to %d', seed)
-            try:
-                return fun(*args, **kwargs)
-            finally:
-                torch.set_rng_state(state)
-                logger.debug('RNG state restored.')
-        return _decorated
-    return _decorator
-def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-    """Get a list of tensors and collate them to a single tensor. according to the following logic:
-    - `dim` specifies the time dimension which will be stacked and padded.
-    - The output will contain 1 new dimension (dimension index 0) which will be the size of
-    of the original list.
-    Args:
-        tensors (tp.List[torch.Tensor]): List of tensors to collate.
-        dim (int): Dimension which will be stacked and padded.
-    Returns:
-        tp.Tuple[torch.Tensor, torch.Tensor]:
-            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
-                (dimension index 0) which will be the size of the original list.
-            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
-    """
-    tensors = [x.transpose(0, dim) for x in tensors]
-    lens = torch.LongTensor([len(x) for x in tensors])
-    padded_tensors = pad_sequence(tensors)
-    padded_tensors = padded_tensors.transpose(0, 1)
-    padded_tensors = padded_tensors.transpose(1, dim + 1)
-    return padded_tensors, lens
-# TODO: Move to flashy?
-def copy_state(state: tp.Any, device: tp.Union[torch.device, str] = 'cpu',
-               dtype: tp.Optional[torch.dtype] = None) -> tp.Any:
-    if isinstance(state, torch.Tensor):
-        if dtype is None or not state.is_floating_point():
-            dtype = state.dtype
-        return state.detach().to(device=device, dtype=dtype, copy=True)
-    elif isinstance(state, dict):
-        return {k: copy_state(v, device, dtype) for k, v in state.items()}
-    elif isinstance(state, list):
-        return [copy_state(v, device, dtype) for v in state]
-# TODO: Move to flashy?
-@contextmanager
-def swap_state(model, state, **kwargs):
-    old_state = copy_state(model.state_dict())
-    model.load_state_dict(state, **kwargs)
-    try:
-        yield
-    finally:
-        model.load_state_dict(old_state)
-@lru_cache(None)
-def warn_once(logger, msg):
-    """Warn about a given message only once."""
-    logger.warning(msg)
-def is_jsonable(x: tp.Any):
-    """Check if an object can be serialized into a json:"""
-    try:
-        json.dumps(x)
-        return True
-    except (TypeError, OverflowError):
-        return False
-def load_clap_state_dict(clap_model, path: tp.Union[str, Path]):
-    """Wrapper around state dict loading of CLAP model
-    addressing compatibility issues between CLAP and AudioCraft
-    HuggingFace transformer version.
-    See: https://github.com/LAION-AI/CLAP/issues/118
-    """
-    from clap_module.factory import load_state_dict  # type: ignore
-    pkg = load_state_dict(path)
-    pkg.pop('text_branch.embeddings.position_ids', None)
-    clap_model.model.load_state_dict(pkg)
-def construct_frame_chords(
-                    min_timestamp: int,
-                    chord_changes: tp.List[tp.Tuple[float, str]],
-                    mapping_dict: tp.Dict,
-                    prev_chord: str,
-                    frame_rate: float,
-                    segment_duration: float,
-                    ) -> tp.List[str]:
-    """ Translate symbolic chords [(start_time, tuples),...] into a frame-level int sequence"""
-    frames = [
-        frame / frame_rate
-        for frame in range(
-            min_timestamp, int(min_timestamp + segment_duration * frame_rate)
-        )
-    ]
-    frame_chords = []
-    current_chord = prev_chord
-    for frame in frames:
-        while chord_changes and frame >= chord_changes[0][0]:
-            current_chord = chord_changes.pop(0)[1]
-        current_chord = 'N' if current_chord in {None, ''} else current_chord
-        frame_chords.append(mapping_dict[current_chord])
-    return frame_chords

modules/constants.py ADDED Viewed

	@@ -0,0 +1,63 @@

+# modules/constants.py
+# constants.py contains all the constants used in the project such as the default LUT example image, prompts, negative prompts, pre-rendered maps, models, LoRA weights, and more.
+# exceptions are made for some values that are read from environment variables
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+import numpy as np
+IS_SHARED_SPACE = "Agents-MCP-Hackathon/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
+# Load environment variables from .env file
+dotenv_path = Path(__file__).parent.parent / '.env'
+load_dotenv(dotenv_path)
+# Function to load env vars from .env and create Python variables
+def load_env_vars(env_path):
+    try:
+        with open(env_path, 'r') as file:
+            for line in file:
+                # Skip empty lines or comments
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    # Split on the first '=' only
+                    if '=' in line:
+                        key, value = line.split('=', 1)
+                        key = key.strip()
+                        value = value.strip()
+                        # Dynamically create a Python variable with the key name
+                        globals()[key] = value
+                        # Also update os.environ (optional, for consistency)
+                        os.environ[key] = value
+    except FileNotFoundError:
+        print(f"Warning: .env file not found at {env_path}")
+USE_FLASH_ATTENTION = os.getenv("USE_FLASH_ATTENTION", "0") == "1"
+HF_API_TOKEN = os.getenv("HF_TOKEN")
+if not HF_API_TOKEN:
+    raise ValueError("HF_TOKEN is not set. Please check your .env file.")
+default_lut_example_img = "./LUT/daisy.jpg"
+MAX_SEED = np.iinfo(np.int32).max
+TARGET_SIZE = (2688,1536)
+BASE_HEIGHT = 640
+SCALE_FACTOR = (12/5)
+TMPDIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
+os.makedirs(TMPDIR, exist_ok=True)
+# Constants for URL shortener
+HF_REPO_ID = "Surn/Storage"  # Or your desired repository
+SHORTENER_JSON_FILE = "shortener.json"
+model_extensions = {".glb", ".gltf", ".obj", ".ply"}
+model_extensions_list = list(model_extensions)
+image_extensions = {".png", ".jpg", ".jpeg", ".webp"}
+image_extensions_list = list(image_extensions)
+music_extensions = {".mp3", ".wav", ".ogg", ".flac"}
+music_extensions_list = list(music_extensions)
+upload_file_types = model_extensions_list + image_extensions_list + music_extensions_list