aiqtech commited on
Commit
8122ef6
·
verified ·
1 Parent(s): 82087af

Deploy from GitHub repository

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/exp.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/logo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,16 @@
1
  ---
2
  title: Heartlib
3
- emoji: 🦀
4
- colorFrom: gray
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
  title: Heartlib
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: "5.35.0"
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Heartlib
13
+
14
+ <p align="center">
15
+
16
+ Deployed from: https://github.com/HeartMuLa/heartlib
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def main_function(input_data):
    """Echo the input back inside a success message.

    Falsy input (empty string, ``None``) yields a prompt asking for input
    instead of the echo.
    """
    if not input_data:
        return "Please provide input"
    return f"Processed successfully! Input received: {input_data}"
9
+
10
# --- Gradio UI -------------------------------------------------------------
# One-page demo: a textbox on the left feeds main_function, the result is
# shown in a read-only textbox on the right.
with gr.Blocks(title="heartlib") as demo:
    # Plain string: the original used an f-string with no placeholders
    # (useless f-prefix); the rendered markdown is byte-identical.
    gr.Markdown("""
    # Heartlib

    <p align="center">

    This space was created from: [https://github.com/HeartMuLa/heartlib](https://github.com/HeartMuLa/heartlib)
    """)

    with gr.Row():
        with gr.Column():
            input_data = gr.Textbox(
                label="Input",
                placeholder="Enter your input here...",
                lines=3,
            )
            process_btn = gr.Button("Process", variant="primary")

        with gr.Column():
            output_data = gr.Textbox(label="Output")

    # Wire the button: input textbox -> main_function -> output textbox.
    process_btn.click(
        fn=main_function,
        inputs=input_data,
        outputs=output_data,
    )

if __name__ == "__main__":
    demo.launch()
assets/badge.svg ADDED
assets/exp.png ADDED

Git LFS Details

  • SHA256: 715d2f0083971cdf62990de35b717a38eae83a25ba434487e48d0612f444a891
  • Pointer size: 131 Bytes
  • Size of remote file: 555 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: 4a70ac32f4997dc5396da8b24df054fb8453b769febf2a12ece7c24fdc0e1668
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
assets/lyrics.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Intro]
2
+
3
+ [Verse]
4
+ The sun creeps in across the floor
5
+ I hear the traffic outside the door
6
+ The coffee pot begins to hiss
7
+ It is another morning just like this
8
+
9
+ [Prechorus]
10
+ The world keeps spinning round and round
11
+ Feet are planted on the ground
12
+ I find my rhythm in the sound
13
+
14
+ [Chorus]
15
+ Every day the light returns
16
+ Every day the fire burns
17
+ We keep on walking down this street
18
+ Moving to the same steady beat
19
+ It is the ordinary magic that we meet
20
+
21
+ [Verse]
22
+ The hours tick deeply into noon
23
+ Chasing shadows,chasing the moon
24
+ Work is done and the lights go low
25
+ Watching the city start to glow
26
+
27
+ [Bridge]
28
+ It is not always easy,not always bright
29
+ Sometimes we wrestle with the night
30
+ But we make it to the morning light
31
+
32
+ [Chorus]
33
+ Every day the light returns
34
+ Every day the fire burns
35
+ We keep on walking down this street
36
+ Moving to the same steady beat
37
+
38
+ [Outro]
39
+ Just another day
40
+ Every single day
assets/tags.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ piano,happy
examples/README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎤 Lyrics Transcription
2
+
3
+ Download the checkpoint using any of the following commands:
4
+ ```
5
+ hf download --local_dir './ckpt/HeartTranscriptor-oss' 'HeartMuLa/HeartTranscriptor-oss'
6
+ modelscope download --model 'HeartMuLa/HeartTranscriptor-oss' --local_dir './ckpt/HeartTranscriptor-oss'
7
+ ```
8
+
9
+ ```
10
+ python ./examples/run_lyrics_transcription.py --model_path=./ckpt
11
+ ```
12
+
13
+ By default this command will load the generated music file at `./assets/output.mp3` and print the transcribed lyrics. Use `--music_path` to specify the path to the music file.
14
+
15
+ Note that our HeartTranscriptor is trained on separated vocal tracks. In this example usage part, we directly demonstrate on unseparated music tracks, which is purely for simplicity of illustration. We recommend using source separation tools like demucs to separate the tracks before transcribing lyrics to achieve better results.
examples/run_lyrics_transcription.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from heartlib import HeartTranscriptorPipeline
2
+ import argparse
3
+ import torch
4
+
5
+
6
def parse_args():
    """Parse CLI options for the lyrics-transcription example.

    ``--model_path`` is mandatory; ``--music_path`` defaults to the bundled
    demo track.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_path", type=str, required=True)
    cli.add_argument("--music_path", type=str, default="./assets/output.mp3")
    return cli.parse_args()
12
+
13
+
14
if __name__ == "__main__":
    args = parse_args()
    # Load the transcription pipeline from a local checkpoint directory.
    # NOTE(review): device is hard-coded to CUDA with fp16 — this script will
    # fail on CPU-only machines; confirm whether a fallback is wanted.
    pipe = HeartTranscriptorPipeline.from_pretrained(
        args.model_path,
        device=torch.device("cuda"),
        dtype=torch.float16,
    )
    with torch.no_grad():
        result = pipe(
            args.music_path,
            # Decoding options forwarded to the pipeline; names match
            # Whisper-style generation knobs (beam search, temperature
            # fallback schedule, hallucination thresholds) — presumably
            # passed through to a Whisper-like ASR model; confirm in
            # HeartTranscriptorPipeline.
            **{
                "max_new_tokens": 256,
                "num_beams": 2,
                "task": "transcribe",
                "condition_on_prev_tokens": False,
                "compression_ratio_threshold": 1.8,
                "temperature": (0.0, 0.1, 0.2, 0.4),
                "logprob_threshold": -1.0,
                "no_speech_threshold": 0.4,
            },
        )
    # Print the transcribed lyrics to stdout.
    print(result)
examples/run_music_generation.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from heartlib import HeartMuLaGenPipeline
2
+ import argparse
3
+ import torch
4
+
5
+
6
def parse_args():
    """Parse CLI options for the music-generation example."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_path", type=str, required=True)
    # Model variant and I/O locations (all plain string options).
    for flag, default in (
        ("--version", "3B"),
        ("--lyrics", "./assets/lyrics.txt"),
        ("--tags", "./assets/tags.txt"),
        ("--save_path", "./assets/output.mp3"),
    ):
        cli.add_argument(flag, type=str, default=default)
    # Sampling hyper-parameters.
    cli.add_argument("--max_audio_length_ms", type=int, default=240_000)
    cli.add_argument("--topk", type=int, default=50)
    cli.add_argument("--temperature", type=float, default=1.0)
    cli.add_argument("--cfg_scale", type=float, default=1.5)
    return cli.parse_args()
19
+
20
+
21
if __name__ == "__main__":
    args = parse_args()
    # Load the generation pipeline from a local checkpoint directory.
    # NOTE(review): hard-coded to CUDA + bf16 — fails on CPU-only machines.
    pipe = HeartMuLaGenPipeline.from_pretrained(
        args.model_path,
        device=torch.device("cuda"),
        dtype=torch.bfloat16,
        version=args.version,
    )
    with torch.no_grad():
        pipe(
            # Paths to text inputs; presumably the pipeline reads these files
            # itself (they are file paths, not contents) — confirm in
            # HeartMuLaGenPipeline.
            {
                "lyrics": args.lyrics,
                "tags": args.tags,
            },
            max_audio_length_ms=args.max_audio_length_ms,
            save_path=args.save_path,
            topk=args.topk,
            temperature=args.temperature,
            cfg_scale=args.cfg_scale,
        )
    print(f"Generated music saved to {args.save_path}")
pyproject.toml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "heartlib"
7
+ version = "0.1.0"
8
+ description = "A Python Library."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "CC-BY-NC-4.0"}
12
+ authors = [
13
+ {name = "HeartMuLa Team", email = "heartmula.ai@gmail.com"}
14
+ ]
15
+ dependencies = [
16
+ "numpy==2.0.2",
17
+ "torch==2.4.1",
18
+ "torchaudio==2.4.1",
19
+ "torchtune==0.4.0",
20
+ "torchao==0.9.0",
21
+ "torchvision==0.19.1",
22
+ "tqdm==4.67.1",
23
+ "traitlets==5.7.1",
24
+ "traittypes==0.2.3",
25
+ "transformers==4.57.0",
26
+ "tokenizers==0.22.1",
27
+ "ipykernel==6.17.1",
28
+ "einops==0.8.1",
29
+ "accelerate==1.12.0",
30
+ "bitsandbytes==0.49.0",
31
+ "vector-quantize-pytorch==1.27.15",
32
+ "modelscope==1.33.0",
33
+ "soundfile"
34
+ ]
35
+ urls = { "homepage" = "https://heartmula.github.io/" }
36
+ classifiers = [
37
+ "Programming Language :: Python :: 3",
38
+ "License :: Other/Proprietary License",
39
+ "Operating System :: OS Independent"
40
+ ]
41
+
42
+ [tool.setuptools]
43
+ package-dir = {"" = "src"}
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["src"]
47
+
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio>=5.35.0
src/heartlib/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .pipelines.music_generation import HeartMuLaGenPipeline
2
+ from .pipelines.lyrics_transcription import HeartTranscriptorPipeline
3
+
4
+ __all__ = [
5
+ "HeartMuLaGenPipeline",
6
+ "HeartTranscriptorPipeline"
7
+ ]
src/heartlib/heartcodec/configuration_heartcodec.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Optional

from transformers.configuration_utils import PretrainedConfig
3
+
4
+
5
class HeartCodecConfig(PretrainedConfig):
    """Configuration for HeartCodec.

    Groups three sub-configurations:
      * residual vector quantizer (``dim`` .. ``num_quantizers``)
      * diffusion transformer latent decoder (``attention_head_dim`` .. ``out_channels``)
      * scalar (SQ) waveform codec (``num_bands`` .. ``res_kernel_size``)
    """

    model_type = "heartcodec"

    def __init__(
        self,
        # config for rvq
        dim: int = 512,
        codebook_size: int = 8192,
        decay: float = 0.9,
        commitment_weight: float = 1.0,
        threshold_ema_dead_code: int = 2,
        use_cosine_sim: bool = False,
        codebook_dim: int = 32,
        num_quantizers: int = 8,
        # config for diffusion transformer
        attention_head_dim: int = 64,
        in_channels: int = 1024,
        norm_type: str = "ada_norm_single",
        num_attention_heads: int = 24,
        num_layers: int = 24,
        num_layers_2: int = 6,
        out_channels: int = 256,
        # config for sq codec
        num_bands: int = 1,
        sample_rate: int = 48000,
        causal: bool = True,
        num_samples: int = 2,
        # FIX: the original used mutable list literals as defaults, which are
        # shared across every config instance (and mutable in place). ``None``
        # sentinels preserve the documented defaults while giving each
        # instance its own fresh list.
        downsample_factors: Optional[List[int]] = None,      # default [3, 4, 4, 4, 5]
        downsample_kernel_sizes: Optional[List[int]] = None,  # default [6, 8, 8, 8, 10]
        upsample_factors: Optional[List[int]] = None,         # default [5, 4, 4, 4, 3]
        upsample_kernel_sizes: Optional[List[int]] = None,    # default [10, 8, 8, 8, 6]
        latent_hidden_dim: int = 128,
        default_kernel_size: int = 7,
        delay_kernel_size: int = 5,
        init_channel: int = 64,
        res_kernel_size: int = 7,
        **kwargs
    ):
        super().__init__(**kwargs)
        # --- residual vector quantizer ---
        self.dim = dim
        self.codebook_size = codebook_size
        self.decay = decay
        self.commitment_weight = commitment_weight
        self.threshold_ema_dead_code = threshold_ema_dead_code
        self.use_cosine_sim = use_cosine_sim
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # --- diffusion transformer ---
        self.attention_head_dim = attention_head_dim
        self.in_channels = in_channels
        self.norm_type = norm_type
        self.num_attention_heads = num_attention_heads
        self.num_layers = num_layers
        self.num_layers_2 = num_layers_2
        self.out_channels = out_channels

        # --- scalar (SQ) codec ---
        self.num_bands = num_bands
        self.sample_rate = sample_rate
        self.causal = causal
        self.num_samples = num_samples
        self.downsample_factors = (
            [3, 4, 4, 4, 5] if downsample_factors is None else downsample_factors
        )
        self.downsample_kernel_sizes = (
            [6, 8, 8, 8, 10]
            if downsample_kernel_sizes is None
            else downsample_kernel_sizes
        )
        self.upsample_factors = (
            [5, 4, 4, 4, 3] if upsample_factors is None else upsample_factors
        )
        self.upsample_kernel_sizes = (
            [10, 8, 8, 8, 6]
            if upsample_kernel_sizes is None
            else upsample_kernel_sizes
        )
        self.latent_hidden_dim = latent_hidden_dim
        self.default_kernel_size = default_kernel_size
        self.delay_kernel_size = delay_kernel_size
        self.init_channel = init_channel
        self.res_kernel_size = res_kernel_size
src/heartlib/heartcodec/modeling_heartcodec.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .models.flow_matching import FlowMatching
3
+ from .models.sq_codec import ScalarModel
4
+ from .configuration_heartcodec import HeartCodecConfig
5
+ from transformers.modeling_utils import PreTrainedModel
6
+ import math
7
+ import numpy as np
8
+
9
+
10
class HeartCodec(PreTrainedModel):
    """Neural audio codec wrapper: flow-matching latent decoder + SQ waveform decoder.

    ``detokenize`` converts discrete codec codes back into a waveform in two
    stages: codes -> continuous latents (``FlowMatching``), then latents ->
    audio samples (``ScalarModel``), processing long inputs in overlapping
    windows that are cross-faded together.
    """

    config_class = HeartCodecConfig

    def __init__(
        self,
        config: HeartCodecConfig,
    ):
        super(HeartCodec, self).__init__(config)

        self.config = config

        # Latent generator conditioned on RVQ codes (diffusion-transformer backbone).
        self.flow_matching = FlowMatching(
            dim=config.dim,
            codebook_size=config.codebook_size,
            decay=config.decay,
            commitment_weight=config.commitment_weight,
            threshold_ema_dead_code=config.threshold_ema_dead_code,
            use_cosine_sim=config.use_cosine_sim,
            codebook_dim=config.codebook_dim,
            num_quantizers=config.num_quantizers,
            attention_head_dim=config.attention_head_dim,
            in_channels=config.in_channels,
            norm_type=config.norm_type,
            num_attention_heads=config.num_attention_heads,
            num_layers=config.num_layers,
            num_layers_2=config.num_layers_2,
            out_channels=config.out_channels,
        )
        # Waveform decoder that maps latents back to audio samples.
        self.scalar_model = ScalarModel(
            num_bands=config.num_bands,
            sample_rate=config.sample_rate,
            causal=config.causal,
            num_samples=config.num_samples,
            downsample_factors=config.downsample_factors,
            downsample_kernel_sizes=config.downsample_kernel_sizes,
            upsample_factors=config.upsample_factors,
            upsample_kernel_sizes=config.upsample_kernel_sizes,
            latent_hidden_dim=config.latent_hidden_dim,
            default_kernel_size=config.default_kernel_size,
            delay_kernel_size=config.delay_kernel_size,
            init_channel=config.init_channel,
            res_kernel_size=config.res_kernel_size,
        )
        self.post_init()

        self.sample_rate = config.sample_rate

    @torch.inference_mode()
    def detokenize(
        self,
        codes,            # discrete codec codes; indexed as [B, Q, T] below
        duration=29.76,   # window length in seconds processed per flow-matching call
        num_steps=10,     # Euler ODE steps per window
        disable_progress=False,
        guidance_scale=1.25,  # classifier-free guidance strength
        device="cuda",
    ):
        """Decode codec codes into a waveform tensor.

        Works window-by-window: each ``duration``-second window of codes is
        decoded to latents (seeded by the tail of the previous window for
        continuity), then all latents are decoded to audio and cross-faded at
        the overlaps. Returns a (channels, samples) tensor truncated to the
        length implied by the input code count.
        """
        codes = codes.unsqueeze(0).to(device)
        # Noise seed for the first window. NOTE(review): the trailing comment
        # says "B, T, 64" but the tensor is built with 256 latent channels;
        # the magic rates (25 latent frames/s, 12.5 code frames/s) are
        # presumably codec frame rates — confirm against training config.
        first_latent = torch.randn(codes.shape[0], int(duration * 25), 256).to(
            device
        )  # B, T, 64
        first_latent_length = 0
        first_latent_codes_length = 0
        # Window geometry in code frames: hop is ~86% of a window (93 -> 80),
        # the remainder overlaps the next window.
        min_samples = int(duration * 12.5)
        hop_samples = min_samples // 93 * 80
        ovlp_samples = min_samples - hop_samples
        # Latents run at 2x the code frame rate, so overlap doubles in frames.
        ovlp_frames = ovlp_samples * 2
        codes_len = codes.shape[-1]  #
        # Final output length in audio samples, derived from the code count.
        target_len = int(
            (codes_len - first_latent_codes_length) / 12.5 * self.sample_rate
        )

        # code repeat
        # Tile the codes until they fill at least one whole window.
        if codes_len < min_samples:
            while codes.shape[-1] < min_samples:
                codes = torch.cat([codes, codes], -1)
            codes = codes[:, :, 0:min_samples]
        codes_len = codes.shape[-1]
        # Pad (by tiling) up to a whole number of hops so the window loop
        # covers the entire sequence.
        if (codes_len - ovlp_frames) % hop_samples > 0:
            len_codes = (
                math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples
                + ovlp_samples
            )
            while codes.shape[-1] < len_codes:
                codes = torch.cat([codes, codes], -1)
            codes = codes[:, :, 0:len_codes]
        latent_length = int(duration * 25)
        latent_list = []

        # --- Stage 1: codes -> latents, one overlapping window at a time ---
        for sinx in range(0, codes.shape[-1] - hop_samples + 1, hop_samples):
            codes_input = []
            codes_input.append(codes[:, :, sinx : sinx + min_samples])
            if sinx == 0 or ovlp_frames == 0:
                # First window (or no overlap): start from pure noise.
                incontext_length = first_latent_length
                latents = self.flow_matching.inference_codes(
                    codes_input,
                    first_latent,
                    latent_length,
                    incontext_length,
                    guidance_scale=guidance_scale,
                    num_steps=num_steps,
                    disable_progress=disable_progress,
                    scenario="other_seg",
                )
                latent_list.append(latents)
            else:
                # Later windows: seed the head with the previous window's tail
                # so the decode is continuous across the boundary; the rest is
                # fresh noise.
                true_latent = latent_list[-1][:, -ovlp_frames:, :]
                len_add_to_latent = latent_length - true_latent.shape[1]  #
                incontext_length = true_latent.shape[1]
                true_latent = torch.cat(
                    [
                        true_latent,
                        torch.randn(
                            true_latent.shape[0],
                            len_add_to_latent,
                            true_latent.shape[-1],
                        ).to(device),
                    ],
                    1,
                )
                latents = self.flow_matching.inference_codes(
                    codes_input,
                    true_latent,
                    latent_length,
                    incontext_length,
                    guidance_scale=guidance_scale,
                    num_steps=num_steps,
                    disable_progress=disable_progress,
                    scenario="other_seg",
                )
                latent_list.append(latents)

        # --- Stage 2: latents -> audio, cross-fading overlaps ---
        latent_list = [l.float() for l in latent_list]
        latent_list[0] = latent_list[0][:, first_latent_length:, :]
        # Re-express window geometry in audio samples for the output stitch.
        min_samples = int(duration * self.sample_rate)
        hop_samples = min_samples // 93 * 80
        ovlp_samples = min_samples - hop_samples

        output = None
        for i in range(len(latent_list)):
            latent = latent_list[i]
            bsz, t, f = latent.shape

            # Split the 2x-rate latent into two interleaved streams:
            # (B, T, F) -> (B, 2, T, F/2) -> (B*2, T, F/2), decoded as a batch.
            latent = latent.reshape(
                latent.shape[0], latent.shape[1], 2, latent.shape[2] // 2
            ).permute(0, 2, 1, 3)
            latent = latent.reshape(
                latent.shape[0] * 2, latent.shape[2], latent.shape[3]
            )
            cur_output = (
                self.scalar_model.decode(latent.transpose(1, 2)).squeeze(0).squeeze(1)
            )  # 1 512 256

            cur_output = cur_output[:, 0:min_samples].detach().cpu()  # B, T
            if cur_output.dim() == 3:
                cur_output = cur_output[0]

            if output is None:
                output = cur_output
            else:
                if ovlp_samples == 0:
                    output = torch.cat([output, cur_output], -1)
                else:
                    # Linear cross-fade over the overlap region: fade out the
                    # previous window while fading in the current one.
                    ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
                    ov_win = torch.cat([ov_win, 1 - ov_win], -1)
                    output[:, -ovlp_samples:] = (
                        output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:]
                        + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
                    )
                    output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
        # Trim the tiled/padded tail back to the true target length.
        output = output[:, 0:target_len]
        return output
src/heartlib/heartcodec/models/flow_matching.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from tqdm import tqdm
5
+ from vector_quantize_pytorch import ResidualVQ
6
+ from .transformer import LlamaTransformer
7
+
8
+
9
class FlowMatching(nn.Module):
    """Conditional flow matching: decode RVQ codes into continuous latents.

    The residual VQ embeds incoming discrete codes into conditioning features;
    a Llama-style transformer (``estimator``) predicts the ODE velocity field,
    which is integrated with a fixed-step Euler solver, optionally with
    classifier-free guidance.
    """

    def __init__(
        self,
        # rvq stuff
        dim: int = 512,
        codebook_size: int = 8192,
        decay: float = 0.9,
        commitment_weight: float = 1.0,
        threshold_ema_dead_code: int = 2,
        use_cosine_sim: bool = False,
        codebook_dim: int = 32,
        num_quantizers: int = 8,
        # dit backbone stuff
        attention_head_dim: int = 64,
        in_channels: int = 1024,
        norm_type: str = "ada_norm_single",
        num_attention_heads: int = 24,
        num_layers: int = 24,
        num_layers_2: int = 6,
        out_channels: int = 256,
    ):
        super().__init__()

        # Used at inference only to turn code indices back into embeddings.
        self.vq_embed = ResidualVQ(
            dim=dim,
            codebook_size=codebook_size,
            decay=decay,
            commitment_weight=commitment_weight,
            threshold_ema_dead_code=threshold_ema_dead_code,
            use_cosine_sim=use_cosine_sim,
            codebook_dim=codebook_dim,
            num_quantizers=num_quantizers,
        )
        self.cond_feature_emb = nn.Linear(dim, dim)
        # Learned "null" conditioning vector substituted wherever the
        # conditioning mask is off (enables classifier-free guidance).
        self.zero_cond_embedding1 = nn.Parameter(torch.randn(dim))
        self.estimator = LlamaTransformer(
            attention_head_dim=attention_head_dim,
            in_channels=in_channels,
            norm_type=norm_type,
            num_attention_heads=num_attention_heads,
            num_layers=num_layers,
            num_layers_2=num_layers_2,
            out_channels=out_channels,
        )

        self.latent_dim = out_channels

    @torch.no_grad()
    def inference_codes(
        self,
        codes,              # list of code tensors; only codes[0] is used here
        true_latents,       # in-context latents to pin at the sequence head
        latent_length,      # number of frames to actually generate
        incontext_length,   # frames of true_latents to keep fixed
        guidance_scale=2.0,
        num_steps=20,
        disable_progress=True,
        scenario="start_seg",
    ):
        """Generate latents conditioned on codes via Euler-integrated flow matching."""
        device = true_latents.device
        dtype = true_latents.dtype
        # codes_bestrq_middle, codes_bestrq_last = codes
        codes_bestrq_emb = codes[0]

        batch_size = codes_bestrq_emb.shape[0]
        self.vq_embed.eval()
        # Code indices -> continuous embeddings via the RVQ codebooks.
        quantized_feature_emb = self.vq_embed.get_output_from_indices(
            codes_bestrq_emb.transpose(1, 2)
        )
        quantized_feature_emb = self.cond_feature_emb(quantized_feature_emb)  # b t 512
        # assert 1==2
        # Nearest-neighbour upsample 2x along time: latents run at twice the
        # code frame rate.
        quantized_feature_emb = F.interpolate(
            quantized_feature_emb.permute(0, 2, 1), scale_factor=2, mode="nearest"
        ).permute(0, 2, 1)

        num_frames = quantized_feature_emb.shape[1]  #
        # Start from Gaussian noise over the full frame span.
        latents = torch.randn(
            (batch_size, num_frames, self.latent_dim), device=device, dtype=dtype
        )
        # Mask semantics: 0 = unconditioned padding, 1 = in-context (pinned),
        # 2 = frames to generate.
        latent_masks = torch.zeros(
            latents.shape[0], latents.shape[1], dtype=torch.int64, device=latents.device
        )
        latent_masks[:, 0:latent_length] = 2
        if scenario == "other_seg":
            latent_masks[:, 0:incontext_length] = 1

        # Replace conditioning with the learned null embedding wherever masked off.
        quantized_feature_emb = (latent_masks > 0.5).unsqueeze(
            -1
        ) * quantized_feature_emb + (latent_masks < 0.5).unsqueeze(
            -1
        ) * self.zero_cond_embedding1.unsqueeze(
            0
        )

        # Keep only the in-context region of true_latents (mask value == 1).
        incontext_latents = (
            true_latents
            * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(-1).float()
        )
        # Scalar count of pinned frames (taken from the first batch element).
        incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]

        additional_model_input = torch.cat([quantized_feature_emb], 1)
        temperature = 1.0
        # Integration grid t in [0, 1], num_steps intervals.
        t_span = torch.linspace(
            0, 1, num_steps + 1, device=quantized_feature_emb.device
        )
        latents = self.solve_euler(
            latents * temperature,
            incontext_latents,
            incontext_length,
            t_span,
            additional_model_input,
            guidance_scale,
        )

        # Hard-overwrite the pinned region with the exact in-context latents.
        latents[:, 0:incontext_length, :] = incontext_latents[
            :, 0:incontext_length, :
        ]  # B, T, dim
        return latents

    def solve_euler(self, x, incontext_x, incontext_length, t_span, mu, guidance_scale):
        """
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
        noise = x.clone()

        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
        # Or in future might add like a return_all_steps flag
        sol = []
        for step in tqdm(range(1, len(t_span))):
            # Re-noise the in-context prefix to the current time t so the pinned
            # frames follow the same probability path as the generated ones.
            x[:, 0:incontext_length, :] = (1 - (1 - 1e-6) * t) * noise[
                :, 0:incontext_length, :
            ] + t * incontext_x[:, 0:incontext_length, :]
            if guidance_scale > 1.0:
                # Classifier-free guidance: run conditioned and unconditioned
                # (zeroed mu) passes in one doubled batch, then extrapolate.
                dphi_dt = self.estimator(
                    torch.cat(
                        [
                            torch.cat([x, x], 0),
                            torch.cat([incontext_x, incontext_x], 0),
                            torch.cat([torch.zeros_like(mu), mu], 0),
                        ],
                        2,
                    ),
                    timestep=t.unsqueeze(-1).repeat(2),
                )
                # NOTE: "dhpi" is a typo for "dphi" but the name is used
                # consistently, so behavior is unaffected.
                dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2, 0)
                dphi_dt = dphi_dt_uncond + guidance_scale * (
                    dhpi_dt_cond - dphi_dt_uncond
                )
            else:
                dphi_dt = self.estimator(
                    torch.cat([x, incontext_x, mu], 2), timestep=t.unsqueeze(-1)
                )

            # Explicit Euler step: x_{t+dt} = x_t + dt * v(x_t, t).
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t

        result = sol[-1]

        return result
src/heartlib/heartcodec/models/sq_codec.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from torch.nn.utils.parametrizations import weight_norm
6
+ from torch.nn.utils import remove_weight_norm
7
+ from torch.autograd.function import InplaceFunction
8
+
9
+
10
def get_padding(kernel_size, dilation=1):
    """Symmetric 'same'-style padding: half the dilated kernel's span."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
12
+
13
+
14
+ # Scripting this brings model speed up 1.4x
15
@torch.jit.script
def snake(x, alpha):
    # Snake activation: x + (1/alpha) * sin^2(alpha * x), applied elementwise.
    # Flatten trailing dims to (B, C, -1) so alpha (shape (1, C, 1)) broadcasts
    # per channel, then restore the original shape. The 1e-9 guards against
    # division by zero when alpha is 0.
    shape = x.shape
    x = x.reshape(shape[0], shape[1], -1)
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x
22
+
23
+
24
class Snake1d(nn.Module):
    """Channel-wise Snake activation with a learnable per-channel alpha."""

    def __init__(self, channels):
        super().__init__()
        # One alpha per channel, broadcast over batch and time dims.
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        out = snake(x, self.alpha)
        return out
31
+
32
+
33
class Conv1d(nn.Conv1d):
    """nn.Conv1d with optional causal (left-only) padding.

    In causal mode the base class applies no padding; instead the input is
    left-padded by dilation*(kernel_size-1) in forward(), so no output sample
    depends on future inputs. In non-causal mode symmetric 'same' padding is
    computed via get_padding().
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        padding_mode: str = "zeros",
        bias: bool = True,
        padding=None,
        causal: bool = False,
        w_init_gain=None,
    ):
        self.causal = causal
        if padding is None:
            if causal:
                padding = 0
                # Full receptive-field padding, applied on the left in forward().
                self.left_padding = dilation * (kernel_size - 1)
            else:
                padding = get_padding(kernel_size, dilation)
        # NOTE(review): if an explicit `padding` is passed together with
        # causal=True, self.left_padding is never set and forward() would
        # raise AttributeError — confirm no caller does this.
        super(Conv1d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            padding_mode=padding_mode,
            bias=bias,
        )
        if w_init_gain is not None:
            # Optional Xavier init with gain chosen for the named nonlinearity.
            torch.nn.init.xavier_uniform_(
                self.weight, gain=torch.nn.init.calculate_gain(w_init_gain)
            )

    def forward(self, x):
        if self.causal:
            # Pad only on the left of the time axis; the unsqueeze/squeeze pair
            # adapts the 3D input to F.pad's 4-value 2D padding signature.
            x = F.pad(x.unsqueeze(2), (self.left_padding, 0, 0, 0)).squeeze(2)

        return super(Conv1d, self).forward(x)
76
+
77
+
78
class ConvTranspose1d(nn.ConvTranspose1d):
    """nn.ConvTranspose1d with optional causal output trimming.

    Causal mode requires kernel_size == 2*stride and zero padding; the last
    `stride` output samples (which depend on future inputs) are trimmed in
    forward(). Non-causal mode uses the standard (kernel_size - stride) // 2
    padding.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        output_padding: int = 0,
        groups: int = 1,
        bias: bool = True,
        dilation: int = 1,
        padding=None,
        padding_mode: str = "zeros",
        causal: bool = False,
    ):
        if padding is None:
            padding = 0 if causal else (kernel_size - stride) // 2
        if causal:
            assert padding == 0, "padding is not allowed in causal ConvTranspose1d."
            assert (
                kernel_size == 2 * stride
            ), "kernel_size must be equal to 2*stride is not allowed in causal ConvTranspose1d."
        super(ConvTranspose1d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
            dilation=dilation,
            padding_mode=padding_mode,
        )
        self.causal = causal
        self.stride = stride

    def forward(self, x):
        x = super(ConvTranspose1d, self).forward(x)
        if self.causal:
            # Drop the trailing samples that would leak future information.
            x = x[:, :, : -self.stride]
        return x
120
+
121
+
122
class PreProcessor(nn.Module):
    """Conv + PReLU followed by average pooling over `num_samples` frames."""

    def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False):
        super().__init__()
        self.pooling = torch.nn.AvgPool1d(kernel_size=num_samples)
        self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal)
        self.activation = nn.PReLU()

    def forward(self, x):
        # Convolve, activate, then downsample along time.
        return self.pooling(self.activation(self.conv(x)))
133
+
134
+
135
class PostProcessor(nn.Module):
    """Nearest-neighbour upsampling by `num_samples` followed by conv + PReLU."""

    def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False):
        super(PostProcessor, self).__init__()
        self.num_samples = num_samples
        self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal)
        self.activation = nn.PReLU()

    def forward(self, x):
        # (B, C, T) -> (B, T, C) so time can be tiled.
        x = torch.transpose(x, 1, 2)
        B, T, C = x.size()
        # Repeat along the channel axis then reshape: each time step is
        # duplicated num_samples times -> (B, T*num_samples, C).
        x = x.repeat(1, 1, self.num_samples).view(B, -1, C)
        # Back to (B, C, T*num_samples) for the conv.
        x = torch.transpose(x, 1, 2)
        output = self.activation(self.conv(x))
        return output
149
+
150
+
151
class ResidualUnit(nn.Module):
    """Residual block: dilated conv then 1x1 conv, each PReLU'd, plus skip."""

    def __init__(self, n_in, n_out, dilation, res_kernel_size=7, causal=False):
        super().__init__()
        # Weight-normalized dilated conv followed by a pointwise conv.
        self.conv1 = weight_norm(
            Conv1d(
                n_in,
                n_out,
                kernel_size=res_kernel_size,
                dilation=dilation,
                causal=causal,
            )
        )
        self.conv2 = weight_norm(Conv1d(n_in, n_out, kernel_size=1, causal=causal))
        self.activation1 = nn.PReLU()
        self.activation2 = nn.PReLU()

    def forward(self, x):
        h = self.activation1(self.conv1(x))
        h = self.activation2(self.conv2(h))
        return h + x
171
+
172
+
173
class ResEncoderBlock(nn.Module):
    """Five dilated ResidualUnits followed by a strided downsampling conv."""

    def __init__(
        self, n_in, n_out, stride, down_kernel_size, res_kernel_size=7, causal=False
    ):
        super().__init__()
        # Dilations grow 1, 3, 5, 7, 9; the first unit maps n_in -> n_out//2,
        # the rest stay at n_out//2.
        units = []
        for i, dilation in enumerate((1, 3, 5, 7, 9)):
            units.append(
                ResidualUnit(
                    n_in if i == 0 else n_out // 2,
                    n_out // 2,
                    dilation=dilation,
                    res_kernel_size=res_kernel_size,
                    causal=causal,
                )
            )
        self.convs = nn.ModuleList(units)

        self.down_conv = DownsampleLayer(
            n_in, n_out, down_kernel_size, stride=stride, causal=causal
        )

    def forward(self, x):
        for unit in self.convs:
            x = unit(x)
        return self.down_conv(x)
227
+
228
+
229
class ResDecoderBlock(nn.Module):
    """Upsampling conv followed by five dilated ResidualUnits."""

    def __init__(
        self, n_in, n_out, stride, up_kernel_size, res_kernel_size=7, causal=False
    ):
        super().__init__()
        self.up_conv = UpsampleLayer(
            n_in,
            n_out,
            kernel_size=up_kernel_size,
            stride=stride,
            causal=causal,
            activation=None,
        )

        # Dilations grow 1, 3, 5, 7, 9; all units operate at n_out channels.
        self.convs = nn.ModuleList(
            [
                ResidualUnit(
                    n_out,
                    n_out,
                    dilation=dilation,
                    res_kernel_size=res_kernel_size,
                    causal=causal,
                )
                for dilation in (1, 3, 5, 7, 9)
            ]
        )

    def forward(self, x):
        x = self.up_conv(x)
        for unit in self.convs:
            x = unit(x)
        return x
288
+
289
+
290
class DownsampleLayer(nn.Module):
    """Downsampling block: Conv1d (optionally weight-normed) plus an
    activation; temporal reduction is done either by a strided convolution
    (default) or by a stride-1 convolution followed by average pooling
    (``pooling=True``).

    Fixes over the previous revision:
      * the ``activation`` argument was silently ignored — a fresh
        ``nn.PReLU()`` was always used; it is now honored, and passing
        ``activation=None`` disables the activation (as ``forward``
        already supported).
      * the old ``activation=nn.PReLU()`` default was evaluated once at
        class-definition time, so layers relying on the default would share
        a single PReLU parameter; the default is now created per instance.
    """

    # Sentinel so callers can still pass ``activation=None`` explicitly.
    _DEFAULT_ACTIVATION = "__default_prelu__"

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        causal: bool = False,
        activation=_DEFAULT_ACTIVATION,
        use_weight_norm: bool = True,
        pooling: bool = False,
    ):
        super(DownsampleLayer, self).__init__()
        self.pooling = pooling
        self.stride = stride
        # Honor the caller's activation; build a fresh PReLU per instance
        # when the default is requested (avoids a shared module parameter).
        if activation is DownsampleLayer._DEFAULT_ACTIVATION:
            activation = nn.PReLU()
        self.activation = activation
        self.use_weight_norm = use_weight_norm
        if pooling:
            # Stride-1 conv; temporal reduction happens in the AvgPool1d.
            self.layer = Conv1d(in_channels, out_channels, kernel_size, causal=causal)
            # NOTE: ``self.pooling`` is rebound from bool to a module here;
            # the truthiness test in ``forward`` works for both cases.
            self.pooling = nn.AvgPool1d(kernel_size=stride)
        else:
            self.layer = Conv1d(
                in_channels, out_channels, kernel_size, stride=stride, causal=causal
            )
        if use_weight_norm:
            self.layer = weight_norm(self.layer)

    def forward(self, x):
        """Conv -> optional activation -> optional average pooling."""
        x = self.layer(x)
        x = self.activation(x) if self.activation is not None else x
        if self.pooling:
            x = self.pooling(x)
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from the conv (inference export)."""
        if self.use_weight_norm:
            # Resolves to the module-level ``remove_weight_norm`` function.
            remove_weight_norm(self.layer)
327
+
328
+
329
class UpsampleLayer(nn.Module):
    """Upsampling block: either a transposed convolution (default, learned
    upsample by ``stride``) or a stride-1 convolution followed by frame
    repetition along time (``repeat=True``), with an optional activation.

    Fix over the previous revision: the ``activation=nn.PReLU()`` default
    was evaluated once at class-definition time, so every UpsampleLayer
    built with the default shared a single PReLU parameter. Each instance
    now gets its own. Passing ``activation=None`` (as ResDecoderBlock does)
    still disables the activation.
    """

    # Sentinel so callers can still pass ``activation=None`` explicitly.
    _DEFAULT_ACTIVATION = "__default_prelu__"

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        causal: bool = False,
        activation=_DEFAULT_ACTIVATION,
        use_weight_norm: bool = True,
        repeat: bool = False,
    ):
        super(UpsampleLayer, self).__init__()
        self.repeat = repeat
        self.stride = stride
        if activation is UpsampleLayer._DEFAULT_ACTIVATION:
            activation = nn.PReLU()  # fresh instance, not shared
        self.activation = activation
        self.use_weight_norm = use_weight_norm
        if repeat:
            self.layer = Conv1d(in_channels, out_channels, kernel_size, causal=causal)
        else:
            self.layer = ConvTranspose1d(
                in_channels, out_channels, kernel_size, stride=stride, causal=causal
            )
        if use_weight_norm:
            self.layer = weight_norm(self.layer)

    def forward(self, x):
        """Upsample ``x`` (B, C, T) -> (B, C', T * stride)."""
        x = self.layer(x)
        x = self.activation(x) if self.activation is not None else x
        if self.repeat:
            # Repeat each time frame ``stride`` times: (B, T, C) ->
            # (B, T, C*stride) -> view as (B, T*stride, C).
            x = torch.transpose(x, 1, 2)
            B, T, C = x.size()
            x = x.repeat(1, 1, self.stride).view(B, -1, C)
            x = torch.transpose(x, 1, 2)
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from the conv (inference export)."""
        if self.use_weight_norm:
            # Resolves to the module-level ``remove_weight_norm`` function.
            remove_weight_norm(self.layer)
368
+
369
+
370
class round_func9(InplaceFunction):
    """Straight-through rounding onto a uniform 1/9 grid.

    Forward snaps each element to the nearest multiple of 1/9; backward
    passes the incoming gradient through unchanged (straight-through
    estimator), since rounding has zero gradient almost everywhere.
    """

    @staticmethod
    def forward(ctx, input):
        # Keep a reference on the context to mirror the original
        # implementation (backward does not actually read it).
        ctx.input = input
        return torch.round(input * 9) / 9

    @staticmethod
    def backward(ctx, grad_output):
        # Identity gradient.
        return grad_output.clone()
380
+
381
+
382
class ScalarModel(nn.Module):
    """Scalar-quantized convolutional autoencoder (codec).

    Encoder: weight-normed Conv1d stem -> optional PreProcessor (only when
    ``num_samples > 1``) -> one ResEncoderBlock per downsample factor
    (channels double at every stage) -> weight-normed Conv1d projecting to a
    tanh-bounded latent of ``latent_hidden_dim`` channels.

    Quantization: ``round_func9`` snaps the latent to multiples of 1/9 with
    a straight-through gradient.

    Decoder mirrors the encoder: a "look ahead" Conv1d (non-causal — it is
    built without the ``causal`` flag and with ``delay_kernel_size``) ->
    one ResDecoderBlock per upsample factor -> optional PostProcessor ->
    weight-normed Conv1d back to ``num_bands`` channels.

    NOTE(review): ``sample_rate`` is accepted but not used anywhere in this
    class; presumably kept for config compatibility — confirm.
    """

    def __init__(
        self,
        num_bands,
        sample_rate,
        causal,
        num_samples,
        downsample_factors,
        downsample_kernel_sizes,
        upsample_factors,
        upsample_kernel_sizes,
        latent_hidden_dim,
        default_kernel_size,
        delay_kernel_size,
        init_channel,
        res_kernel_size,
        mode="pre_proj",
    ):
        super(ScalarModel, self).__init__()
        # Built as plain lists first, converted to nn.ModuleList at the end
        # so parameter registration and state_dict keys follow list order.
        self.encoder = []
        self.decoder = []
        # Scalar quantizer. Instantiating the Function is unusual but
        # harmless: only the classmethod ``apply`` is ever used.
        self.vq = round_func9()  # using 9
        self.mode = mode
        # Encoder parts
        self.encoder.append(
            weight_norm(
                Conv1d(
                    num_bands,
                    init_channel,
                    kernel_size=default_kernel_size,
                    causal=causal,
                )
            )
        )
        if num_samples > 1:
            # Downsampling
            self.encoder.append(
                PreProcessor(
                    init_channel,
                    init_channel,
                    num_samples,
                    kernel_size=default_kernel_size,
                    causal=causal,
                )
            )
        # Channel width doubles with every downsampling stage:
        # init_channel * 2**i -> init_channel * 2**(i+1).
        for i, down_factor in enumerate(downsample_factors):
            self.encoder.append(
                ResEncoderBlock(
                    init_channel * np.power(2, i),
                    init_channel * np.power(2, i + 1),
                    down_factor,
                    downsample_kernel_sizes[i],
                    res_kernel_size,
                    causal=causal,
                )
            )
        # Project the widest feature map down to the latent dimension.
        self.encoder.append(
            weight_norm(
                Conv1d(
                    init_channel * np.power(2, len(downsample_factors)),
                    latent_hidden_dim,
                    kernel_size=default_kernel_size,
                    causal=causal,
                )
            )
        )
        # Decoder
        # look ahead: deliberately built WITHOUT the causal flag so the
        # decoder may peek ``delay_kernel_size`` latent frames ahead.
        self.decoder.append(
            weight_norm(
                Conv1d(
                    latent_hidden_dim,
                    init_channel * np.power(2, len(upsample_factors)),
                    kernel_size=delay_kernel_size,
                )
            )
        )
        # Mirror image of the encoder: channels halve at every stage.
        for i, upsample_factor in enumerate(upsample_factors):
            self.decoder.append(
                ResDecoderBlock(
                    init_channel * np.power(2, len(upsample_factors) - i),
                    init_channel * np.power(2, len(upsample_factors) - i - 1),
                    upsample_factor,
                    upsample_kernel_sizes[i],
                    res_kernel_size,
                    causal=causal,
                )
            )
        if num_samples > 1:
            self.decoder.append(
                PostProcessor(
                    init_channel,
                    init_channel,
                    num_samples,
                    kernel_size=default_kernel_size,
                    causal=causal,
                )
            )
        self.decoder.append(
            weight_norm(
                Conv1d(
                    init_channel,
                    num_bands,
                    kernel_size=default_kernel_size,
                    causal=causal,
                )
            )
        )
        self.encoder = nn.ModuleList(self.encoder)
        self.decoder = nn.ModuleList(self.decoder)

    def forward(self, x):
        """Full autoencode: encode -> quantize -> decode.

        The last encoder layer's output is squashed with tanh so the latent
        lies in [-1, 1] before rounding. (``F.tanh`` is a deprecated alias
        of ``torch.tanh``; behavior is identical.)
        """
        for i, layer in enumerate(self.encoder):
            if i != len(self.encoder) - 1:
                x = layer(x)
            else:
                x = F.tanh(layer(x))
        x = self.vq.apply(x)  # vq
        for i, layer in enumerate(self.decoder):
            x = layer(x)
        return x

    def inference(self, x):
        """Like ``forward`` but also returns the latents.

        Returns ``(emb, emb_quant, x)``: the tanh-bounded embedding, its
        quantized version, and the reconstruction decoded from the latter.
        """
        for i, layer in enumerate(self.encoder):
            if i != len(self.encoder) - 1:
                x = layer(x)
            else:
                x = F.tanh(layer(x))  # reverse to tanh

        emb = x
        emb_quant = self.vq.apply(emb)  # vq
        x = emb_quant
        for i, layer in enumerate(self.decoder):
            x = layer(x)
        return emb, emb_quant, x

    def encode(self, x):
        """Run the encoder and return the (un-quantized) embedding.

        NOTE(review): ``emb_quant`` is computed and then discarded — the
        method returns the pre-quantization ``emb``. Confirm whether
        callers expect the quantized latent here.
        """
        for i, layer in enumerate(self.encoder):
            if i != len(self.encoder) - 1:
                x = layer(x)
            else:
                x = F.tanh(layer(x))  # reverse to tanh

        emb = x
        emb_quant = self.vq.apply(emb)  # vq
        return emb

    def decode(self, x):
        """Quantize ``x`` (so inputs match the training distribution of the
        decoder) and decode it back to band features."""
        x = self.vq.apply(
            x
        )  # make sure the prediction follow the similar disctribution
        for i, layer in enumerate(self.decoder):
            x = layer(x)
        return x
src/heartlib/heartcodec/models/transformer.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Tuple
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction, no bias).

    Normalizes over the last dimension by the RMS of the activations and
    applies a learned per-channel gain.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # 1 / RMS over the feature dimension, eps-stabilized.
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * inv_rms)
18
+
19
+
20
class RotaryEmbedding(nn.Module):
    """Precomputes and caches sin/cos tables for rotary position embeddings.

    ``get_sin_cos`` returns tables of shape (seq_len, dim // 2); results are
    memoized per (seq_len, device, dtype).
    """

    def __init__(self, dim: int, base: int = 10000):
        super().__init__()
        self.dim = dim          # number of channels RoPE is applied to
        self.base = base        # frequency base (10000 as in the RoPE paper)
        self._cache = {}        # (seq_len, device, dtype) -> (sin, cos)

    def get_sin_cos(self, seq_len: int, device, dtype):
        """Return (sin, cos) tables of shape (seq_len, dim // 2)."""
        key = (seq_len, device, dtype)
        cached = self._cache.get(key, None)
        # Device is part of the key already; the extra check is defensive.
        if cached is not None and cached[0].device == device:
            return cached
        # inv_freq[j] = base**(-2j/dim), the standard RoPE frequency ladder.
        inv_freq = 1.0 / (
            self.base
            ** (torch.arange(0, self.dim, 2, device=device, dtype=dtype) / self.dim)
        )
        t = torch.arange(seq_len, device=device, dtype=dtype)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        sin = freqs.sin()
        cos = freqs.cos()
        self._cache[key] = (sin, cos)
        return sin, cos

    def apply_rotary(
        self, x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor
    ) -> torch.Tensor:
        """Rotate the first ``dim`` channels of ``x`` by (sin, cos).

        NOTE(review): this method appears unused — LlamaAttention implements
        its own pairing (``apply_rope_vec``), and the unsqueeze/reshape
        pattern here does not obviously match the (seq, dim//2) shape of
        the cached tables. Verify before relying on it.
        """
        x1, x2 = x[..., : self.dim // 2], x[..., self.dim // 2 : self.dim]
        # Interleave sin/cos across pairs
        x_rot = torch.stack((-x2, x1), dim=-1).reshape_as(x[..., : self.dim])
        return (x[..., : self.dim] * cos.unsqueeze(-1)).reshape_as(
            x[..., : self.dim]
        ) + (x_rot * sin.unsqueeze(-1)).reshape_as(x[..., : self.dim])
52
+
53
+
54
class LlamaAttention(nn.Module):
    """Multi-head attention with rotary position embeddings (Llama-style).

    Self-attention by default; cross-attention when ``cross_attention_dim``
    is given (keys/values are projected from ``encoder_hidden_states``).
    Prefers PyTorch SDPA when available, with a manual matmul/softmax
    fallback.
    """

    def __init__(
        self,
        dim: int,
        n_heads: int,
        head_dim: int,
        bias: bool = False,
        dropout: float = 0.0,
        rope_dim: Optional[int] = None,
        cross_attention_dim: Optional[int] = None,
        use_sdpa: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.inner_dim = n_heads * head_dim
        self.cross_attention_dim = cross_attention_dim
        self.q_proj = nn.Linear(dim, self.inner_dim, bias=bias)
        # K/V come from the encoder stream when cross-attending.
        k_in = dim if cross_attention_dim is None else cross_attention_dim
        self.k_proj = nn.Linear(k_in, self.inner_dim, bias=bias)
        self.v_proj = nn.Linear(k_in, self.inner_dim, bias=bias)
        self.o_proj = nn.Linear(self.inner_dim, dim, bias=bias)
        self.dropout = dropout
        # RoPE is applied to the first ``rope_dim`` channels of each head.
        self.rope_dim = rope_dim if rope_dim is not None else head_dim
        self.rope = RotaryEmbedding(self.rope_dim)
        self.use_sdpa = use_sdpa
        self._has_sdpa = hasattr(F, "scaled_dot_product_attention")

    def _shape(self, x: torch.Tensor, b: int, t: int) -> torch.Tensor:
        # (b, t, inner_dim) -> (b, n_heads, t, head_dim)
        return x.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(
        self,
        x: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Attend over ``x`` (self) or ``encoder_hidden_states`` (cross).

        ``attention_mask`` may be [b, s], [b, 1, s], [b, t, s] or
        [b, 1, t, s]; it is broadcast to SDPA's 4-D convention. In the
        fallback path it is ADDED to the scores, so it is expected to be an
        additive (-inf/0) mask there.
        """
        b, t, c = x.shape
        q = self._shape(self.q_proj(x), b, t)
        if encoder_hidden_states is None:
            k = self._shape(self.k_proj(x), b, t)
            v = self._shape(self.v_proj(x), b, t)
        else:
            bt, tk, ck = encoder_hidden_states.shape
            k = self._shape(self.k_proj(encoder_hidden_states), b, tk)
            v = self._shape(self.v_proj(encoder_hidden_states), b, tk)

        # RoPE on first rope_dim of head_dim
        rope_dim = min(self.rope_dim, self.head_dim)
        # NOTE(review): sin/cos are sized from the KEY length and reused for
        # the queries below; for cross-attention with differing q/k lengths
        # the ``sin_.view`` inside apply_rope_vec would not match — confirm
        # this path is only used with equal lengths.
        seq_len_for_rope = k.shape[-2]
        sin, cos = self.rope.get_sin_cos(
            seq_len_for_rope, device=x.device, dtype=x.dtype
        )

        def apply_rope_vec(tensor):
            # Rotate the first ``rope_dim`` channels pairwise; pass the
            # remaining channels through untouched.
            head = tensor[..., :rope_dim]
            tail = tensor[..., rope_dim:]
            b, h, tt, _ = head.shape
            head = head.view(b, h, tt, rope_dim // 2, 2)
            sin_ = sin.view(1, 1, tt, rope_dim // 2, 1)
            cos_ = cos.view(1, 1, tt, rope_dim // 2, 1)
            x1 = head[..., 0:1]
            x2 = head[..., 1:2]
            rot = torch.cat(
                [x1 * cos_ - x2 * sin_, x1 * sin_ + x2 * cos_], dim=-1
            ).view(b, h, tt, rope_dim)
            return torch.cat([rot, tail], dim=-1)

        q = apply_rope_vec(q)
        k = apply_rope_vec(k)

        # Prefer PyTorch SDPA (can enable FlashAttention kernel on supported GPUs)
        if self.use_sdpa and self._has_sdpa:
            s = k.shape[-2]
            attn_mask_sdpa = None
            if attention_mask is not None:
                m = attention_mask

                # Normalize the mask to SDPA's broadcastable 4-D layout.
                if m.dim() == 2 and m.shape == (b, s):  # [b, s]
                    m = m[:, None, None, :]  # [b,1,1,s]
                elif m.dim() == 3 and m.shape[-2] == 1:  # [b,1,s]
                    m = m[:, None, :, :]  # [b,1,1,s]
                elif m.dim() == 3 and m.shape[-2] == t:  # [b,t,s]
                    m = m[:, None, :, :]  # [b,1,t,s]
                elif m.dim() == 4 and m.shape[1] == 1:  # [b,1,t,s] or [b,1,1,s]
                    pass
                attn_mask_sdpa = m

            out = F.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=attn_mask_sdpa,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=False,
            )
            out = out.transpose(1, 2).contiguous().view(b, t, self.inner_dim)
            return self.o_proj(out)
        else:
            # Manual fallback: scaled dot-product with additive mask.
            attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(
                self.head_dim
            )
            if attention_mask is not None:
                attn_scores = attn_scores + attention_mask
            attn = attn_scores.softmax(dim=-1)
            attn = F.dropout(attn, p=self.dropout, training=self.training)
            out = torch.matmul(attn, v)
            out = out.transpose(1, 2).contiguous().view(b, t, self.inner_dim)
            return self.o_proj(out)
164
+
165
+
166
class LlamaMLP(nn.Module):
    """SwiGLU feed-forward block (Llama-style).

    The hidden width defaults to ``4 * dim``, is scaled by 2/3 (SwiGLU's
    third projection keeps parameter count comparable to a plain 4x MLP),
    then rounded UP to the nearest multiple of ``multiple_of``.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: Optional[int] = None,
        multiple_of: int = 256,
        dropout: float = 0.0,
    ):
        super().__init__()
        if not hidden_dim:
            hidden_dim = 4 * dim
        # align to multiple_of like Llama
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.gate = nn.Linear(dim, hidden_dim, bias=False)
        self.up = nn.Linear(dim, hidden_dim, bias=False)
        self.down = nn.Linear(hidden_dim, dim, bias=False)
        self.dropout = dropout

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # silu(gate(x)) * up(x), dropout, project back to ``dim``.
        gated = F.silu(self.gate(x)) * self.up(x)
        gated = F.dropout(gated, p=self.dropout, training=self.training)
        return self.down(gated)
188
+
189
+
190
class LlamaTransformerBlock(nn.Module):
    """Pre-norm transformer block: RMSNorm -> attention -> residual, then
    RMSNorm -> SwiGLU MLP -> residual.

    With ``use_ada_layer_norm_single=True`` (PixArt-style adaLN-single) the
    normalized activations are additionally modulated by shift/scale/gate
    vectors derived from ``timestep``.

    NOTE(review): when ``cross_attention_dim`` is given, a cross-attention
    layer is constructed but never invoked in ``forward`` — confirm whether
    that path is intentionally dormant.
    """

    def __init__(
        self,
        dim: int,
        n_heads: int,
        head_dim: int,
        mlp_multiple_of: int = 256,
        dropout: float = 0.0,
        attention_bias: bool = False,
        cross_attention_dim: Optional[int] = None,
        use_ada_layer_norm_single: bool = False,
    ):
        super().__init__()
        self.attn_norm = RMSNorm(dim, 1e-6)
        # Self-attention: note cross_attention_dim is fixed to None here.
        self.attn = LlamaAttention(
            dim,
            n_heads,
            head_dim,
            bias=attention_bias,
            dropout=dropout,
            rope_dim=head_dim,
            cross_attention_dim=None,
        )
        self.cross_attn = None
        if cross_attention_dim is not None:
            self.cross_attn_norm = RMSNorm(dim, 1e-6)
            self.cross_attn = LlamaAttention(
                dim,
                n_heads,
                head_dim,
                bias=attention_bias,
                dropout=dropout,
                rope_dim=head_dim,
                cross_attention_dim=cross_attention_dim,
            )
        self.mlp_norm = RMSNorm(dim, 1e-6)
        self.mlp = LlamaMLP(dim, multiple_of=mlp_multiple_of, dropout=dropout)
        self.use_ada_layer_norm_single = use_ada_layer_norm_single
        if self.use_ada_layer_norm_single:
            # Learned base table for the six modulation vectors
            # (shift/scale/gate for attention and for the MLP).
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

    def forward(
        self,
        x: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        timestep: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the block.

        In adaLN-single mode ``timestep`` must be a [B, 6*dim] modulation
        tensor (as produced by AdaLayerNormSingleFlow); otherwise it is
        ignored. ``encoder_hidden_states`` is currently unused (see class
        note).
        """
        if self.use_ada_layer_norm_single:
            batch_size = x.shape[0]
            # timestep: [B, 6*D]
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
            ).chunk(6, dim=1)

            # Self-Attention with modulation and gating
            norm_hidden_states = self.attn_norm(x)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
            h = self.attn(norm_hidden_states, attention_mask=attention_mask)
            h = gate_msa * h
            x = x + h

            # MLP with modulation and gating
            norm_hidden_states = self.mlp_norm(x)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
            h = self.mlp(norm_hidden_states)
            h = gate_mlp * h
            x = x + h
            return x
        else:
            # Plain pre-norm residual block.
            h = self.attn(self.attn_norm(x), attention_mask=attention_mask)
            x = x + h
            h = self.mlp(self.mlp_norm(x))
            x = x + h
            return x
265
+
266
+
267
class ProjectLayer(nn.Module):
    """Projection over (B, T, C) features: a Conv1d along time followed by
    a Linear mixing layer.

    The conv output is rescaled by ``kernel_size ** -0.5`` before the
    linear layer. ``dropout`` is stored but not applied in ``forward``
    (matches the original implementation).
    """

    def __init__(self, hidden_size, filter_size, kernel_size=1, dropout=0.0):
        super().__init__()
        self.kernel_size = kernel_size
        self.dropout = dropout
        # 'same'-style padding for odd kernel sizes.
        self.ffn_1 = nn.Conv1d(
            hidden_size, filter_size, kernel_size, padding=kernel_size // 2
        )
        self.ffn_2 = nn.Linear(filter_size, filter_size)

    def forward(self, x):
        # (B, T, C) -> (B, C, T) for the conv, back, rescale, then mix.
        y = self.ffn_1(x.transpose(1, 2)).transpose(1, 2)
        y = y * self.kernel_size**-0.5
        return self.ffn_2(y)
282
+
283
+
284
class LlamaTransformer(nn.Module):
    """Two-stage Llama-style transformer with adaLN-style timestep
    conditioning (flow-matching head).

    Stage 1: project ``hidden_states`` into ``inner_dim``, run
    ``num_layers`` blocks, apply a modulated LayerNorm.
    Stage 2: concatenate the stage-1 output with the raw input, project to
    ``inner_dim_2`` (= 2 * inner_dim), run ``num_layers_2`` wider blocks,
    apply a second modulated LayerNorm, and project to ``out_channels``.

    NOTE(review): ``norm_type`` selects adaLN-single inside the blocks, but
    the blocks' own modulation path is only exercised when the adaln
    embedders produce a ``timestep_mod`` — both are always constructed
    below, so the ``is not None`` checks in ``forward`` are defensive.
    """

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        in_channels: int,
        out_channels: int,
        num_layers: int = 12,
        num_layers_2: int = 2,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        norm_type: str = "layer_norm",
    ):
        super().__init__()
        inner_dim = num_attention_heads * attention_head_dim
        inner_dim_2 = inner_dim * 2
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.inner_dim = inner_dim
        self.inner_dim_2 = inner_dim_2
        self.dropout = dropout

        self.proj_in = ProjectLayer(in_channels, inner_dim, kernel_size=3)

        use_ada_single = norm_type == "ada_norm_single"
        self.transformer_blocks = nn.ModuleList(
            [
                LlamaTransformerBlock(
                    dim=inner_dim,
                    n_heads=num_attention_heads,
                    head_dim=attention_head_dim,
                    dropout=dropout,
                    attention_bias=False,
                    cross_attention_dim=cross_attention_dim,
                    use_ada_layer_norm_single=use_ada_single,
                )
                for _ in range(num_layers)
            ]
        )

        # Second (wider) stack: doubled width via doubled head_dim.
        self.transformer_blocks_2 = nn.ModuleList(
            [
                LlamaTransformerBlock(
                    dim=inner_dim_2,
                    n_heads=num_attention_heads,
                    head_dim=attention_head_dim * 2,
                    dropout=dropout,
                    attention_bias=False,
                    cross_attention_dim=cross_attention_dim,
                    use_ada_layer_norm_single=use_ada_single,
                )
                for _ in range(num_layers_2)
            ]
        )

        # Fuses the raw input with the stage-1 output for stage 2.
        self.connection_proj = ProjectLayer(
            in_channels + inner_dim, inner_dim_2, kernel_size=3
        )
        self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
        self.norm_out_2 = nn.LayerNorm(inner_dim_2, elementwise_affine=False, eps=1e-6)
        # Learned base shift/scale for the two output norms.
        self.scale_shift_table = nn.Parameter(
            torch.randn(2, inner_dim) / inner_dim**0.5
        )
        self.scale_shift_table_2 = nn.Parameter(
            torch.randn(2, inner_dim_2) / inner_dim_2**0.5
        )
        self.proj_out = ProjectLayer(inner_dim_2, out_channels, kernel_size=3)
        # Timestep embedders (defined later in this module; resolved at
        # runtime, which is fine for module-level classes).
        self.adaln_single = AdaLayerNormSingleFlow(inner_dim)
        self.adaln_single_2 = AdaLayerNormSingleFlow(inner_dim_2)

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: Optional[torch.LongTensor] = None,
    ):
        """Map (B, T, in_channels) -> (B, T, out_channels), optionally
        conditioned on ``timestep`` (one scalar per batch element)."""
        s = self.proj_in(hidden_states)

        embedded_timestep = None
        timestep_mod = None
        if self.adaln_single is not None and timestep is not None:
            batch_size = s.shape[0]
            timestep_mod, embedded_timestep = self.adaln_single(
                timestep, hidden_dtype=s.dtype
            )
        for blk in self.transformer_blocks:
            s = blk(s, timestep=timestep_mod)

        # Without a timestep, fall back to zero conditioning so the
        # modulated output norm reduces to its learned base table.
        if embedded_timestep is None:
            embedded_timestep = torch.zeros(
                s.size(0), s.size(-1), device=s.device, dtype=s.dtype
            )

        shift, scale = (
            self.scale_shift_table[None] + embedded_timestep[:, None]
        ).chunk(2, dim=1)
        s = self.norm_out(s)
        s = s * (1 + scale) + shift

        # Stage 2: fuse raw input with stage-1 features.
        x = torch.cat([hidden_states, s], dim=-1)
        x = self.connection_proj(x)

        embedded_timestep_2 = None
        timestep_mod_2 = None
        if self.adaln_single_2 is not None and timestep is not None:
            batch_size = x.shape[0]
            timestep_mod_2, embedded_timestep_2 = self.adaln_single_2(
                timestep, hidden_dtype=x.dtype
            )
        for blk in self.transformer_blocks_2:
            x = blk(x, timestep=timestep_mod_2)

        if embedded_timestep_2 is None:
            embedded_timestep_2 = torch.zeros(
                x.size(0), x.size(-1), device=x.device, dtype=x.dtype
            )

        shift_2, scale_2 = (
            self.scale_shift_table_2[None] + embedded_timestep_2[:, None]
        ).chunk(2, dim=1)
        x = self.norm_out_2(x)
        x = x * (1 + scale_2) + shift_2

        out = self.proj_out(x)

        return out
409
+
410
+
411
class PixArtAlphaCombinedFlowEmbeddings(nn.Module):
    """Sinusoidal flow-timestep embedding followed by a learned MLP.

    Produces a 512-dim sinusoidal embedding of the (continuous) flow
    timestep, scaled by 1000, and projects it to ``embedding_dim`` with a
    TimestepEmbedding MLP.

    NOTE(review): ``size_emb_dim`` is stored as ``outdim`` but not used
    anywhere in this class — presumably kept for interface parity with the
    PixArt-alpha embedder it is modeled on.
    """

    def __init__(self, embedding_dim: int, size_emb_dim: int):
        super().__init__()
        self.flow_t_size = 512  # width of the raw sinusoidal embedding
        self.outdim = size_emb_dim
        self.timestep_embedder = TimestepEmbedding(
            in_channels=self.flow_t_size, time_embed_dim=embedding_dim
        )

    def timestep_embedding(self, timesteps, max_period=10000, scale=1000):
        """Return the [B, flow_t_size] sinusoidal embedding of ``timesteps``.

        ``scale`` multiplies the timestep before the sinusoids (flow
        timesteps are typically in [0, 1], so this maps them to the
        classic 0..1000 diffusion range).
        """
        half = self.flow_t_size // 2
        # freqs[k] = exp(-log(max_period) * k / half), cast to the
        # timesteps' dtype.
        freqs = torch.exp(
            -math.log(max_period)
            * torch.arange(start=0, end=half, device=timesteps.device)
            / half
        ).type(timesteps.type())
        args = timesteps[:, None] * freqs[None] * scale
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        # Dormant branch: flow_t_size is fixed at 512 (even), so no padding
        # is ever added here.
        if self.flow_t_size % 2:
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
            )
        return embedding

    def forward(self, timestep, hidden_dtype):
        """Embed ``timestep`` and project to the model width in
        ``hidden_dtype``."""
        timesteps_proj = self.timestep_embedding(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))
        conditioning = timesteps_emb
        return conditioning
440
+
441
+
442
class AdaLayerNormSingleFlow(nn.Module):
    """adaLN-single conditioning head (PixArt-alpha style) for flow
    timesteps.

    Embeds the timestep, then maps it through SiLU + Linear to a
    [B, 6 * embedding_dim] modulation vector, which the transformer blocks
    reshape into six shift/scale/gate vectors.
    """

    def __init__(self, embedding_dim: int):
        super().__init__()
        # size_emb_dim is unused downstream; see the embedder's note.
        self.emb = PixArtAlphaCombinedFlowEmbeddings(
            embedding_dim, size_emb_dim=embedding_dim // 3
        )
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)

    def forward(
        self,
        timestep: torch.Tensor,
        hidden_dtype: Optional[torch.dtype] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (modulation [B, 6*D], embedded_timestep [B, D])."""

        embedded_timestep = self.emb(timestep, hidden_dtype=hidden_dtype)
        return self.linear(self.silu(embedded_timestep)), embedded_timestep
459
+
460
+
461
class TimestepEmbedding(nn.Module):
    """Two-layer MLP (Linear -> SiLU -> Linear) for timestep features,
    mapping ``in_channels`` to ``time_embed_dim``."""

    def __init__(self, in_channels: int, time_embed_dim: int):
        super().__init__()
        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
        self.act = nn.SiLU()
        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear_2(self.act(self.linear_1(x)))
473
+
474
+
475
class Timesteps(nn.Module):
    """Non-learned sinusoidal timestep embedding.

    Produces ``num_channels`` features per timestep using log-spaced
    frequencies; ``flip_sin_to_cos`` chooses [cos | sin] ordering instead
    of [sin | cos]. Odd channel counts are zero-padded by one.
    """

    def __init__(
        self,
        num_channels: int,
        flip_sin_to_cos: bool = True,
        downscale_freq_shift: float = 0,
    ):
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift

    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
        half_dim = self.num_channels // 2
        # exponent[k] = -log(10000) * k / (half_dim - shift)
        exponent = (
            -math.log(10000)
            * torch.arange(0, half_dim, device=timesteps.device)
            / (half_dim - self.downscale_freq_shift)
        )
        emb = torch.exp(exponent)[None, :] * timesteps[:, None]
        if self.flip_sin_to_cos:
            parts = (torch.cos(emb), torch.sin(emb))
        else:
            parts = (torch.sin(emb), torch.cos(emb))
        emb = torch.cat(parts, dim=-1)
        if self.num_channels % 2 == 1:
            emb = torch.nn.functional.pad(emb, (0, 1))
        return emb
src/heartlib/heartmula/configuration_heartmula.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+
3
+
4
class HeartMuLaConfig(PretrainedConfig):
    """HuggingFace configuration for the HeartMuLa model.

    Attributes:
        backbone_flavor: key into the FLAVORS registry for the main
            torchtune decoder (default "llama-3B").
        decoder_flavor: key into FLAVORS for the small per-frame audio
            decoder (default "llama-300M").
        text_vocab_size: size of the text token vocabulary.
        audio_vocab_size: size of EACH audio codebook's vocabulary.
        audio_num_codebooks: number of residual audio codebooks per frame.
        muq_dim: width of the MuQ (music-understanding) conditioning
            embedding fed through ``muq_linear``.
    """

    model_type = "heartmula"

    def __init__(
        self,
        backbone_flavor: str = "llama-3B",
        decoder_flavor: str = "llama-300M",
        text_vocab_size: int = 128256,
        audio_vocab_size: int = 8197,
        audio_num_codebooks: int = 8,
        muq_dim: int = 512,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.backbone_flavor = backbone_flavor
        self.decoder_flavor = decoder_flavor
        self.text_vocab_size = text_vocab_size
        self.audio_vocab_size = audio_vocab_size
        self.audio_num_codebooks = audio_num_codebooks
        self.muq_dim = muq_dim
src/heartlib/heartmula/modeling_heartmula.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from .configuration_heartmula import HeartMuLaConfig
4
+ from transformers.modeling_utils import PreTrainedModel
5
+ import torch
6
+ import torch.nn as nn
7
+ import torchtune
8
+ from torchtune.models import llama3_2
9
+
10
+
11
+ def llama3_2_3B() -> torchtune.modules.transformer.TransformerDecoder:
12
+ return llama3_2.llama3_2(
13
+ vocab_size=128_256,
14
+ num_layers=28,
15
+ num_heads=24,
16
+ num_kv_heads=8,
17
+ embed_dim=3072,
18
+ max_seq_len=8192,
19
+ intermediate_dim=8192,
20
+ attn_dropout=0.0,
21
+ norm_eps=1e-5,
22
+ rope_base=500_000,
23
+ scale_factor=32,
24
+ )
25
+
26
+
27
+ def llama3_2_300M() -> torchtune.modules.transformer.TransformerDecoder:
28
+ return llama3_2.llama3_2(
29
+ vocab_size=128_256,
30
+ num_layers=3,
31
+ num_heads=8,
32
+ num_kv_heads=4,
33
+ embed_dim=3072,
34
+ max_seq_len=2048,
35
+ intermediate_dim=8192,
36
+ attn_dropout=0.0,
37
+ norm_eps=1e-5,
38
+ rope_base=500_000,
39
+ scale_factor=32,
40
+ )
41
+
42
+
43
+ def llama3_2_7B() -> torchtune.modules.transformer.TransformerDecoder:
44
+ return llama3_2.llama3_2(
45
+ vocab_size=128_256,
46
+ num_layers=32,
47
+ num_heads=32,
48
+ num_kv_heads=8,
49
+ embed_dim=4096,
50
+ max_seq_len=8192,
51
+ intermediate_dim=14336,
52
+ attn_dropout=0.0,
53
+ norm_eps=1e-5,
54
+ rope_base=500_000,
55
+ scale_factor=32,
56
+ )
57
+
58
+
59
+ def llama3_2_400M() -> torchtune.modules.transformer.TransformerDecoder:
60
+ return llama3_2.llama3_2(
61
+ vocab_size=128_256,
62
+ num_layers=4,
63
+ num_heads=8,
64
+ num_kv_heads=4,
65
+ embed_dim=3072,
66
+ max_seq_len=2048,
67
+ intermediate_dim=8192,
68
+ attn_dropout=0.0,
69
+ norm_eps=1e-5,
70
+ rope_base=500_000,
71
+ scale_factor=32,
72
+ ) # 减少了num_heads和num_kv_heads之间的倍速,提升了精确度,但降低了效率
73
+
74
+
75
+ FLAVORS = {
76
+ "llama-3B": llama3_2_3B,
77
+ "llama-300M": llama3_2_300M,
78
+ "llama-7B": llama3_2_7B,
79
+ "llama-400M": llama3_2_400M,
80
+ }
81
+
82
+
83
+ def _prepare_transformer(model):
84
+ embed_dim = model.tok_embeddings.embedding_dim
85
+ model.tok_embeddings = nn.Identity()
86
+ model.output = nn.Identity()
87
+ return model, embed_dim
88
+
89
+
90
+ def _create_causal_mask(seq_len: int, device: torch.device):
91
+ return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))
92
+
93
+
94
+ def _index_causal_mask(mask: torch.Tensor, input_pos: torch.Tensor):
95
+ r = mask[input_pos, :]
96
+ return r
97
+
98
+
99
+ def _multinomial_sample_one_no_sync(
100
+ probs,
101
+ ): # Does multinomial sampling without a cuda synchronization
102
+ q = torch.empty_like(probs).exponential_(1)
103
+ return torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int)
104
+
105
+
106
+ def sample_topk(logits: torch.Tensor, topk: int, temperature: float):
107
+ logits = logits / temperature
108
+
109
+ filter_value: float = -float("Inf")
110
+ indices_to_remove = logits < torch.topk(logits, topk)[0][..., -1, None]
111
+ scores_processed = logits.masked_fill(indices_to_remove, filter_value)
112
+ scores_processed = torch.nn.functional.log_softmax(scores_processed, dim=-1)
113
+ probs = torch.nn.functional.softmax(scores_processed, dim=-1)
114
+
115
+ sample_token = _multinomial_sample_one_no_sync(probs)
116
+ return sample_token
117
+
118
+
119
+ class HeartMuLa(PreTrainedModel):
120
+ config_class = HeartMuLaConfig
121
+
122
+ def __init__(
123
+ self,
124
+ config: HeartMuLaConfig,
125
+ ):
126
+ super(HeartMuLa, self).__init__(config)
127
+
128
+ self.config = config
129
+
130
+ self.backbone, backbone_dim = _prepare_transformer(
131
+ FLAVORS[config.backbone_flavor]()
132
+ )
133
+ self.decoder, decoder_dim = _prepare_transformer(
134
+ FLAVORS[config.decoder_flavor]()
135
+ )
136
+
137
+ self.text_embeddings = nn.Embedding(config.text_vocab_size, backbone_dim)
138
+ self.audio_embeddings = nn.Embedding(
139
+ config.audio_vocab_size * config.audio_num_codebooks, backbone_dim
140
+ )
141
+ self.unconditional_text_embedding = nn.Embedding(1, backbone_dim)
142
+
143
+ self.projection = nn.Linear(backbone_dim, decoder_dim, bias=False)
144
+ self.codebook0_head = nn.Linear(
145
+ backbone_dim, config.audio_vocab_size, bias=False
146
+ )
147
+ self.audio_head = nn.Parameter(
148
+ torch.empty(
149
+ config.audio_num_codebooks - 1, decoder_dim, config.audio_vocab_size
150
+ )
151
+ )
152
+ self.muq_linear = nn.Linear(config.muq_dim, backbone_dim)
153
+ self.post_init()
154
+
155
+ def setup_caches(self, max_batch_size: int):
156
+ dtype = next(self.parameters()).dtype
157
+ device = next(self.parameters()).device
158
+
159
+ try:
160
+ self.reset_caches()
161
+ except RuntimeError:
162
+ pass
163
+
164
+ with device:
165
+ self.backbone.setup_caches(max_batch_size, dtype)
166
+ self.decoder.setup_caches(
167
+ max_batch_size,
168
+ dtype,
169
+ decoder_max_seq_len=self.config.audio_num_codebooks,
170
+ )
171
+
172
+ self.register_buffer(
173
+ "backbone_causal_mask",
174
+ _create_causal_mask(self.backbone.max_seq_len, device),
175
+ )
176
+ self.register_buffer(
177
+ "decoder_causal_mask",
178
+ _create_causal_mask(self.config.audio_num_codebooks, device),
179
+ )
180
+
181
    def generate_frame(
        self,
        tokens: torch.Tensor,
        tokens_mask: torch.Tensor,
        input_pos: torch.Tensor,
        temperature: float,
        topk: int,
        cfg_scale: float,
        continuous_segments: torch.Tensor = None,
        starts=None,
    ) -> torch.Tensor:
        """Autoregressively sample one audio frame (all codebooks) for the batch.

        The backbone predicts codebook 0 from the merged token embeddings; the
        small decoder then predicts codebooks 1..N-1 one at a time, conditioned
        on the backbone's last hidden state and the previously sampled codebook.

        Args:
            tokens: (B, S, C) token ids; per ``_embed_tokens`` the last channel
                is the text token and the preceding channels are audio codebooks.
            tokens_mask: (B, S, C) bool mask selecting which channels contribute.
            input_pos: (B, S) absolute positions into the backbone KV cache.
            temperature: Sampling temperature for ``sample_topk``.
            topk: Top-k cutoff for ``sample_topk``.
            cfg_scale: Classifier-free-guidance scale; > 1.0 with an even batch
                means rows [0, B/2) are conditional and [B/2, B) unconditional.
            continuous_segments: Optional MuQ embedding injected at position
                ``starts`` of each sequence (prompt step only).
            starts: Per-batch index at which ``continuous_segments`` is written.

        Returns:
            (B, audio_num_codebooks) sampled token ids for this frame.
        """
        b, s, _ = tokens.size()

        assert self.backbone.caches_are_enabled(), "backbone caches are not enabled"
        curr_backbone_mask = _index_causal_mask(self.backbone_causal_mask, input_pos)

        # With CFG the batch is [conditional | unconditional] halves; mark the
        # unconditional rows so their text/MuQ inputs get replaced below.
        uncond_mask = None
        if cfg_scale > 1.0 and b > 1:
            actual_B = b // 2
            uncond_mask = torch.cat(
                [
                    torch.zeros(actual_B, dtype=torch.bool, device=tokens.device),
                    torch.ones(actual_B, dtype=torch.bool, device=tokens.device),
                ]
            )

        embeds = self._embed_tokens(tokens, uncond_mask=uncond_mask)
        masked_embeds = embeds * tokens_mask.unsqueeze(-1)
        h = masked_embeds.sum(dim=2, dtype=embeds.dtype)  # merge
        if continuous_segments is not None:
            # Project the MuQ conditioning into backbone space and splice it in.
            continuous_segments = self.muq_linear(continuous_segments)
            if uncond_mask is not None:
                # Unconditional rows get the learned "no text" embedding instead.
                uncond_embed = self.unconditional_text_embedding(
                    torch.zeros(1, device=tokens.device, dtype=torch.long)
                )
                mask_expanded = uncond_mask.view(b, 1).expand_as(continuous_segments)
                continuous_segments = torch.where(
                    mask_expanded, uncond_embed, continuous_segments
                )
            batch_indices = torch.arange(h.shape[0], device=h.device)
            h[batch_indices, starts] = continuous_segments
        h = self.backbone(h, input_pos=input_pos, mask=curr_backbone_mask)
        last_h = h[:, -1, :]  # the last frame
        c0_logits = self.codebook0_head(last_h)  # only predict the audio part

        if cfg_scale > 1.0 and b > 1 and (b % 2 == 0):
            actual_B = b // 2
            cond_logits = c0_logits[:actual_B, :]
            uncond_logits = c0_logits[actual_B:, :]
            # Standard CFG combination: uncond + scale * (cond - uncond).
            guided_logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
            c0_sample = sample_topk(guided_logits, topk, temperature)
            c0_sample = c0_sample.repeat(
                2, 1
            )  # repeat to both branches to keep alignment
        else:
            c0_sample = sample_topk(c0_logits, topk, temperature)

        c0_embed = self._embed_audio(0, c0_sample)

        # The decoder cache is per-frame: clear it before the codebook loop.
        self.decoder.reset_caches()
        curr_h = torch.cat([last_h.unsqueeze(1), c0_embed], dim=1)
        curr_sample = c0_sample.clone()
        curr_pos = (
            torch.arange(0, curr_h.size(1), device=curr_h.device)
            .unsqueeze(0)
            .repeat(curr_h.size(0), 1)
        )
        curr_h = curr_h.to(embeds.dtype)
        for i in range(1, self.config.audio_num_codebooks):
            curr_decoder_mask = _index_causal_mask(self.decoder_causal_mask, curr_pos)
            decoder_h = self.decoder(
                self.projection(curr_h), input_pos=curr_pos, mask=curr_decoder_mask
            )
            # Each non-zero codebook has its own output projection (audio_head).
            ci_logits = torch.mm(decoder_h[:, -1, :], self.audio_head[i - 1])
            if cfg_scale > 1.0 and b > 1 and (b % 2 == 0):
                actual_B = b // 2
                cond_ci = ci_logits[:actual_B, :]
                uncond_ci = ci_logits[actual_B:, :]
                guided_ci = uncond_ci + (cond_ci - uncond_ci) * cfg_scale

                ci_sample = sample_topk(guided_ci, topk, temperature)
                ci_sample = ci_sample.repeat(2, 1)
            else:
                ci_sample = sample_topk(ci_logits, topk, temperature)
            ci_embed = self._embed_audio(i, ci_sample)
            curr_h = ci_embed
            curr_sample = torch.cat([curr_sample, ci_sample], dim=1)
            curr_pos = curr_pos[:, -1:] + 1

        return curr_sample
271
+
272
    def reset_caches(self):
        """Clear the KV caches of both the backbone and the decoder."""
        self.backbone.reset_caches()
        self.decoder.reset_caches()
275
+
276
+ def _embed_local_audio(self, tokens):
277
+ """the token from 0-30"""
278
+ audio_tokens = tokens + (
279
+ self.config.audio_vocab_size
280
+ * torch.arange(self.config.audio_num_codebooks - 1, device=tokens.device)
281
+ )
282
+ audio_embeds = self.audio_embeddings(audio_tokens.view(-1)).reshape(
283
+ tokens.size(0), tokens.size(1), self.config.audio_num_codebooks - 1, -1
284
+ )
285
+ return audio_embeds
286
+
287
    def _embed_audio(self, codebook: int, tokens: torch.Tensor) -> torch.Tensor:
        """Embed the tokens of one codebook.

        The embedding table is shared across codebooks; each codebook owns a
        contiguous slice of size ``audio_vocab_size``, so ids are offset by
        ``codebook * audio_vocab_size`` before lookup.
        """
        return self.audio_embeddings(tokens + codebook * self.config.audio_vocab_size)
289
+
290
    def _embed_tokens(
        self, tokens: torch.Tensor, uncond_mask: torch.Tensor | None
    ) -> torch.Tensor:
        """Embed the interleaved audio+text token tensor.

        Args:
            tokens: (B, S, audio_num_codebooks + 1) ids; the last channel is the
                text token, the preceding channels are the audio codebooks.
            uncond_mask: Optional (B,) bool mask; True rows have their text
                embedding replaced by the learned unconditional embedding
                (classifier-free guidance branch).

        Returns:
            (B, S, audio_num_codebooks + 1, D) embeddings with the text channel
            appended after the audio channels.
        """
        B, S, _ = tokens.size()
        text_embeds = self.text_embeddings(tokens[:, :, -1])

        if uncond_mask is not None:
            uncond_text_embed = self.unconditional_text_embedding(
                torch.zeros(1, device=tokens.device, dtype=torch.long)
            )
            # Broadcast the per-row mask over (S, D) and swap in the
            # unconditional embedding for masked rows.
            mask_expanded = uncond_mask.view(B, 1, 1).expand_as(text_embeds)
            text_embeds = torch.where(
                mask_expanded,
                uncond_text_embed,
                text_embeds,
            )

        text_embeds = text_embeds.unsqueeze(-2)

        # Shift each codebook's ids into its slice of the shared table.
        audio_tokens = tokens[:, :, :-1] + (
            self.config.audio_vocab_size
            * torch.arange(self.config.audio_num_codebooks, device=tokens.device)
        )
        audio_embeds = self.audio_embeddings(audio_tokens.view(-1)).reshape(
            tokens.size(0), tokens.size(1), self.config.audio_num_codebooks, -1
        )
        return torch.cat([audio_embeds, text_embeds], dim=-2)
src/heartlib/pipelines/__init__.py ADDED
File without changes
src/heartlib/pipelines/lyrics_transcription.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.pipelines.automatic_speech_recognition import (
2
+ AutomaticSpeechRecognitionPipeline,
3
+ )
4
+ from transformers.models.whisper.modeling_whisper import WhisperForConditionalGeneration
5
+ from transformers.models.whisper.processing_whisper import WhisperProcessor
6
+ import torch
7
+ import os
8
+
9
+
10
class HeartTranscriptorPipeline(AutomaticSpeechRecognitionPipeline):
    """Whisper-based ASR pipeline specialised for lyrics transcription.

    Thin wrapper around :class:`AutomaticSpeechRecognitionPipeline` that knows
    how to locate the ``HeartTranscriptor-oss`` checkpoint inside a pretrained
    folder. The redundant ``__init__`` that only forwarded to ``super()`` has
    been removed; the inherited constructor is used directly.
    """

    @classmethod
    def from_pretrained(
        cls, pretrained_path: str, device: torch.device, dtype: torch.dtype
    ):
        """Build the pipeline from a local checkpoint folder.

        Args:
            pretrained_path: Folder expected to contain ``HeartTranscriptor-oss``.
            device: Device the pipeline should run on.
            dtype: Torch dtype used to load the model weights.

        Returns:
            A configured :class:`HeartTranscriptorPipeline`.

        Raises:
            FileNotFoundError: If the checkpoint folder does not exist.
        """
        hearttranscriptor_path = os.path.join(pretrained_path, "HeartTranscriptor-oss")
        # Guard clause: fail fast with a clear message before touching weights.
        if not os.path.exists(hearttranscriptor_path):
            raise FileNotFoundError(
                f"Expected to find checkpoint for HeartTranscriptor at {hearttranscriptor_path} but not found. Please check your folder {pretrained_path}."
            )

        model = WhisperForConditionalGeneration.from_pretrained(
            hearttranscriptor_path, torch_dtype=dtype, low_cpu_mem_usage=True
        )
        processor = WhisperProcessor.from_pretrained(hearttranscriptor_path)

        return cls(
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=device,
            dtype=dtype,
            # 30 s chunking matches Whisper's native receptive window.
            chunk_length_s=30,
            batch_size=16,
        )
+ )
src/heartlib/pipelines/music_generation.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.pipelines.base import Pipeline
2
+ from tokenizers import Tokenizer
3
+ from ..heartmula.modeling_heartmula import HeartMuLa
4
+ from ..heartcodec.modeling_heartcodec import HeartCodec
5
+ import torch
6
+ from typing import Dict, Any, Optional
7
+ import os
8
+ from dataclasses import dataclass
9
+ from tqdm import tqdm
10
+ import torchaudio
11
+ import json
12
+ from transformers import BitsAndBytesConfig
13
+
14
+
15
@dataclass
class HeartMuLaGenConfig:
    """Special-token ids used during HeartMuLa music generation."""

    text_bos_id: int = 128000  # beginning-of-sequence id for text streams
    text_eos_id: int = 128001  # end-of-sequence id for text streams
    audio_eos_id: int = 8193  # first id treated as end-of-audio
    empty_id: int = 0  # filler id for unused codebook slots

    @classmethod
    def from_file(cls, path: str):
        """Build a config from a JSON file whose keys match the field names."""
        with open(path, encoding="utf-8") as fp:
            return cls(**json.load(fp))
27
+
28
+
29
class HeartMuLaGenPipeline(Pipeline):
    """End-to-end lyrics+tags -> music generation pipeline.

    Wires together the HeartMuLa language model (token generation), the
    HeartCodec audio codec (token -> waveform), and a text tokenizer, following
    the transformers ``Pipeline`` contract (``preprocess`` / ``_forward`` /
    ``postprocess``).
    """

    def __init__(
        self,
        model: HeartMuLa,
        audio_codec: HeartCodec,
        muq_mulan: Optional[Any],
        text_tokenizer: Tokenizer,
        config: HeartMuLaGenConfig,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Store components and derive per-frame layout constants.

        Args:
            model: HeartMuLa token-generation model.
            audio_codec: HeartCodec used to decode tokens into audio.
            muq_mulan: Optional MuQ-MuLan model (currently unused; ref_audio
                conditioning is not implemented).
            text_tokenizer: Tokenizer for tags and lyrics.
            config: Special-token id configuration.
            device: Target device.
            dtype: Autocast/model dtype.
        """
        super().__init__(model, dtype=dtype)
        self.model = model
        self.audio_codec = audio_codec
        self.muq_mulan = muq_mulan
        self.text_tokenizer = text_tokenizer
        self.config = config

        # One column per codec quantizer plus one text column.
        self._parallel_number = audio_codec.config.num_quantizers + 1
        self._muq_dim = model.config.muq_dim

    def _sanitize_parameters(self, **kwargs):
        """Split user kwargs into preprocess/forward/postprocess kwargs.

        Note: ``cfg_scale`` goes to both preprocess (to decide whether to
        duplicate the batch for classifier-free guidance) and forward (to apply
        the guidance when sampling).
        """
        preprocess_kwargs = {"cfg_scale": kwargs.get("cfg_scale", 1.5)}
        forward_kwargs = {
            "max_audio_length_ms": kwargs.get("max_audio_length_ms", 120_000),
            "temperature": kwargs.get("temperature", 1.0),
            "topk": kwargs.get("topk", 50),
            "cfg_scale": kwargs.get("cfg_scale", 1.5),
        }
        postprocess_kwargs = {
            "save_path": kwargs.get("save_path", "output.mp3"),
        }
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs: Dict[str, Any], cfg_scale: float):
        """Turn ``{"tags": ..., "lyrics": ..., "ref_audio": ...}`` into model tensors.

        ``tags`` and ``lyrics`` may each be a literal string or a path to a
        text file. Both are lowercased and tokenized with BOS/EOS added; tags
        are additionally wrapped in ``<tag>``/``</tag>``. When ``cfg_scale``
        differs from 1.0 every tensor is duplicated along the batch dim so the
        model can run conditional and unconditional branches in one pass.
        """

        # process tags
        tags = inputs["tags"]
        if os.path.isfile(tags):
            with open(tags, encoding="utf-8") as fp:
                tags = fp.read()
        assert isinstance(tags, str), f"tags must be a string, but got {type(tags)}"

        tags = tags.lower()
        # encapsulate with special <tag> and </tag> tokens
        if not tags.startswith("<tag>"):
            tags = f"<tag>{tags}"
        if not tags.endswith("</tag>"):
            tags = f"{tags}</tag>"

        tags_ids = self.text_tokenizer.encode(tags).ids
        if tags_ids[0] != self.config.text_bos_id:
            tags_ids = [self.config.text_bos_id] + tags_ids
        if tags_ids[-1] != self.config.text_eos_id:
            tags_ids = tags_ids + [self.config.text_eos_id]

        # process reference audio
        ref_audio = inputs.get("ref_audio", None)
        if ref_audio is not None:
            raise NotImplementedError("ref_audio is not supported yet.")
        # Placeholder MuQ embedding (zeros) inserted right after the tags.
        muq_embed = torch.zeros([self._muq_dim], dtype=self.dtype)
        muq_idx = len(tags_ids)

        # process lyrics
        lyrics = inputs["lyrics"]
        if os.path.isfile(lyrics):
            with open(lyrics, encoding="utf-8") as fp:
                lyrics = fp.read()
        assert isinstance(
            lyrics, str
        ), f"lyrics must be a string, but got {type(lyrics)}"
        lyrics = lyrics.lower()

        lyrics_ids = self.text_tokenizer.encode(lyrics).ids
        if lyrics_ids[0] != self.config.text_bos_id:
            lyrics_ids = [self.config.text_bos_id] + lyrics_ids
        if lyrics_ids[-1] != self.config.text_eos_id:
            lyrics_ids = lyrics_ids + [self.config.text_eos_id]

        # cat them together. tags, ref_audio, lyrics
        # The "+ 1" reserves one position for the MuQ embedding slot.
        prompt_len = len(tags_ids) + 1 + len(lyrics_ids)

        # Text ids live in the last column; audio columns stay zero in the prompt.
        tokens = torch.zeros([prompt_len, self._parallel_number], dtype=torch.long)
        tokens[: len(tags_ids), -1] = torch.tensor(tags_ids)
        tokens[len(tags_ids) + 1 :, -1] = torch.tensor(lyrics_ids)

        tokens_mask = torch.zeros_like(tokens, dtype=torch.bool)
        tokens_mask[:, -1] = True

        bs_size = 2 if cfg_scale != 1.0 else 1

        def _cfg_cat(tensor: torch.Tensor, cfg_scale: float):
            # Add a batch dim; duplicate it when CFG needs an uncond branch.
            tensor = tensor.unsqueeze(0)
            if cfg_scale != 1.0:
                tensor = torch.cat([tensor, tensor], dim=0)
            return tensor

        return {
            "tokens": _cfg_cat(tokens, cfg_scale),
            "tokens_mask": _cfg_cat(tokens_mask, cfg_scale),
            "muq_embed": _cfg_cat(muq_embed, cfg_scale),
            "muq_idx": [muq_idx] * bs_size,
            "pos": _cfg_cat(torch.arange(prompt_len, dtype=torch.long), cfg_scale),
        }

    def _forward(
        self,
        model_inputs: Dict[str, Any],
        max_audio_length_ms: int,
        temperature: float,
        topk: int,
        cfg_scale: float,
    ):
        """Autoregressively generate audio frames, then decode to a waveform.

        Args:
            model_inputs: Tensors produced by :meth:`preprocess`.
            max_audio_length_ms: Hard cap on output length; one generated frame
                corresponds to 80 ms (``max_audio_length_ms // 80`` frames).
            temperature: Sampling temperature.
            topk: Top-k cutoff for sampling.
            cfg_scale: Classifier-free-guidance scale.

        Returns:
            ``{"wav": waveform}`` decoded by the audio codec.
        """
        prompt_tokens = model_inputs["tokens"]
        prompt_tokens_mask = model_inputs["tokens_mask"]
        continuous_segment = model_inputs["muq_embed"]
        starts = model_inputs["muq_idx"]
        prompt_pos = model_inputs["pos"]

        frames = []

        bs_size = 2 if cfg_scale != 1.0 else 1
        self.model.setup_caches(bs_size)
        # First step consumes the whole prompt (with MuQ conditioning injected).
        with torch.autocast(device_type=self.device.type, dtype=self.dtype):
            curr_token = self.model.generate_frame(
                tokens=prompt_tokens,
                tokens_mask=prompt_tokens_mask,
                input_pos=prompt_pos,
                temperature=temperature,
                topk=topk,
                cfg_scale=cfg_scale,
                continuous_segments=continuous_segment,
                starts=starts,
            )
        # Keep only row 0 (the conditional branch) for output.
        frames.append(curr_token[0:1,])

        def _pad_audio_token(token: torch.Tensor):
            # Re-pack a sampled audio frame into the (audio..., text) column
            # layout: audio ids in the first columns, empty_id in the text slot,
            # with the mask excluding the text column.
            padded_token = (
                torch.ones(
                    (token.shape[0], self._parallel_number),
                    device=token.device,
                    dtype=torch.long,
                )
                * self.config.empty_id
            )
            padded_token[:, :-1] = token
            padded_token = padded_token.unsqueeze(1)
            padded_token_mask = torch.ones_like(
                padded_token, device=token.device, dtype=torch.bool
            )
            padded_token_mask[..., -1] = False
            return padded_token, padded_token_mask

        # 80 ms of audio per frame — see max_audio_length_ms docstring above.
        max_audio_frames = max_audio_length_ms // 80

        for i in tqdm(range(max_audio_frames)):
            curr_token, curr_token_mask = _pad_audio_token(curr_token)
            with torch.autocast(device_type=self.device.type, dtype=self.dtype):
                curr_token = self.model.generate_frame(
                    tokens=curr_token,
                    tokens_mask=curr_token_mask,
                    input_pos=prompt_pos[..., -1:] + i + 1,
                    temperature=temperature,
                    topk=topk,
                    cfg_scale=cfg_scale,
                    continuous_segments=None,
                    starts=None,
                )
            # Stop as soon as any codebook of the conditional row emits EOS.
            if torch.any(curr_token[0:1, :] >= self.config.audio_eos_id):
                break
            frames.append(curr_token[0:1,])
        # (T, 1, K) -> (K, T): codebooks-major token grid for the codec.
        frames = torch.stack(frames).permute(1, 2, 0).squeeze(0)
        wav = self.audio_codec.detokenize(frames)
        return {"wav": wav}

    def postprocess(self, model_outputs: Dict[str, Any], save_path: str):
        """Write the generated waveform to ``save_path`` at 48 kHz."""
        wav = model_outputs["wav"]
        torchaudio.save(save_path, wav, 48000)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_path: str,
        device: torch.device,
        dtype: torch.dtype,
        version: str,
        bnb_config: Optional[BitsAndBytesConfig] = None,
    ):
        """Assemble the pipeline from a local checkpoint folder.

        Expects ``pretrained_path`` to contain ``HeartCodec-oss``,
        ``HeartMuLa-oss-{version}``, ``tokenizer.json`` and ``gen_config.json``.

        Raises:
            FileNotFoundError: If any of the four required artifacts is missing.
        """

        if os.path.exists(
            heartcodec_path := os.path.join(pretrained_path, "HeartCodec-oss")
        ):
            heartcodec = HeartCodec.from_pretrained(heartcodec_path, device_map=device)
        else:
            raise FileNotFoundError(
                f"Expected to find checkpoint for HeartCodec at {heartcodec_path} but not found. Please check your folder {pretrained_path}."
            )

        if os.path.exists(
            heartmula_path := os.path.join(pretrained_path, f"HeartMuLa-oss-{version}")
        ):
            heartmula = HeartMuLa.from_pretrained(
                heartmula_path, dtype=dtype, quantization_config=bnb_config
            )
        else:
            raise FileNotFoundError(
                f"Expected to find checkpoint for HeartMuLa at {heartmula_path} but not found. Please check your folder {pretrained_path}."
            )

        if os.path.isfile(
            vocab_path := os.path.join(pretrained_path, "tokenizer.json")
        ):
            tokenizer = Tokenizer.from_file(vocab_path)
        else:
            raise FileNotFoundError(
                f"Expected to find tokenizer.json for HeartMuLa at {vocab_path} but not found. Please check your folder {pretrained_path}."
            )

        if os.path.isfile(
            gen_config_path := os.path.join(pretrained_path, "gen_config.json")
        ):
            gen_config = HeartMuLaGenConfig.from_file(gen_config_path)
        else:
            raise FileNotFoundError(
                f"Expected to find gen_config.json for HeartMuLa at {gen_config_path} but not found. Please check your folder {pretrained_path}."
            )

        return cls(heartmula, heartcodec, None, tokenizer, gen_config, device, dtype)