seemanthraju committed
Commit · 7be9079
first commit
Files changed:
- .gitattributes +3 -0
- .gitignore +83 -0
- README.md +198 -0
- chiluka/__init__.py +9 -0
- chiluka/configs/config_ft.yml +116 -0
- chiluka/inference.py +368 -0
- chiluka/models/__init__.py +21 -0
- chiluka/models/core.py +731 -0
- chiluka/models/diffusion/__init__.py +22 -0
- chiluka/models/diffusion/diffusion.py +72 -0
- chiluka/models/diffusion/modules.py +367 -0
- chiluka/models/diffusion/sampler.py +176 -0
- chiluka/models/diffusion/utils.py +40 -0
- chiluka/models/hifigan.py +266 -0
- chiluka/pretrained/ASR/__init__.py +1 -0
- chiluka/pretrained/ASR/__pycache__/__init__.cpython-310.pyc +0 -0
- chiluka/pretrained/ASR/__pycache__/layers.cpython-310.pyc +0 -0
- chiluka/pretrained/ASR/__pycache__/models.cpython-310.pyc +0 -0
- chiluka/pretrained/ASR/config.yml +29 -0
- chiluka/pretrained/ASR/epoch_00080.pth +3 -0
- chiluka/pretrained/ASR/layers.py +354 -0
- chiluka/pretrained/ASR/models.py +186 -0
- chiluka/pretrained/JDC/__init__.py +1 -0
- chiluka/pretrained/JDC/__pycache__/__init__.cpython-310.pyc +0 -0
- chiluka/pretrained/JDC/__pycache__/model.cpython-310.pyc +0 -0
- chiluka/pretrained/JDC/bst.t7 +3 -0
- chiluka/pretrained/JDC/model.py +190 -0
- chiluka/pretrained/PLBERT/__pycache__/util.cpython-310.pyc +0 -0
- chiluka/pretrained/PLBERT/config.yml +30 -0
- chiluka/pretrained/PLBERT/step_1000000.t7 +3 -0
- chiluka/pretrained/PLBERT/util.py +42 -0
- chiluka/text_utils.py +24 -0
- chiluka/utils.py +21 -0
- examples/basic_synthesis.py +51 -0
- examples/telugu_synthesis.py +53 -0
- pyproject.toml +64 -0
- setup.py +60 -0
.gitattributes
ADDED
@@ -0,0 +1,3 @@
# Git LFS tracking for large model files
*.pth filter=lfs diff=lfs merge=lfs -text
*.t7 filter=lfs diff=lfs merge=lfs -text
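For reference, the two filter lines above are what `git lfs track` writes into `.gitattributes`; a minimal sketch of reproducing them, assuming Git LFS is installed:

```bash
# Register the large checkpoint formats with Git LFS; each `track` call
# appends a "filter=lfs diff=lfs merge=lfs -text" line to .gitattributes.
git lfs install
git lfs track "*.pth"
git lfs track "*.t7"
git add .gitattributes
```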
.gitignore
ADDED
@@ -0,0 +1,83 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
tests/
# IDE
.idea/
.vscode/
*.swp
*.swo

# Jupyter Notebook
.ipynb_checkpoints

# OS
.DS_Store
Thumbs.db

# Test outputs
test_outputs/
*.wav
!chiluka/pretrained/**

# Note: Large model files are tracked with Git LFS
# If not using Git LFS, uncomment these lines:
# *.pth
# *.t7
README.md
ADDED
@@ -0,0 +1,198 @@
# Chiluka 🦜

**Chiluka** (చిలుక - Telugu for "parrot") is a self-contained TTS (Text-to-Speech) inference package based on StyleTTS2.

## Features

- 🚀 Simple, clean API for TTS synthesis
- 📦 **Fully self-contained** - all models bundled in the package
- 🎙️ Style transfer from reference audio
- 🌍 Multi-language support via phonemizer
- 🔧 No external dependencies on other repos

## Installation

### From Source (Recommended)

```bash
git clone https://github.com/yourusername/chiluka.git
cd chiluka
pip install -e .
```

**Note:** This repo uses Git LFS for large model files. Make sure to install Git LFS first:

```bash
# Ubuntu/Debian
sudo apt-get install git-lfs
git lfs install

# macOS
brew install git-lfs
git lfs install

# Then clone
git lfs clone https://github.com/yourusername/chiluka.git
```

### Install espeak-ng (Required for phonemization)

**Ubuntu/Debian:**
```bash
sudo apt-get install espeak-ng
```

**macOS:**
```bash
brew install espeak-ng
```

## Quick Start

```python
from chiluka import Chiluka

# Initialize - uses bundled models automatically!
tts = Chiluka()

# Synthesize speech
wav = tts.synthesize(
    text="Hello, this is Chiluka speaking!",
    reference_audio="path/to/reference.wav",
    language="en"
)

# Save to file
tts.save_wav(wav, "output.wav")
```

### Telugu Example

```python
from chiluka import Chiluka

tts = Chiluka()

wav = tts.synthesize(
    text="నమస్కారం, నేను చిలుక మాట్లాడుతున్నాను",
    reference_audio="path/to/telugu_reference.wav",
    language="te"  # Telugu
)

tts.save_wav(wav, "telugu_output.wav")
```

## Package Structure

```
chiluka/
├── chiluka/
│   ├── __init__.py
│   ├── inference.py          # Main Chiluka API
│   ├── text_utils.py
│   ├── utils.py
│   ├── configs/
│   │   └── config_ft.yml     # Model configuration
│   ├── checkpoints/
│   │   └── *.pth             # Trained model checkpoint
│   ├── pretrained/
│   │   ├── ASR/              # Text aligner model
│   │   ├── JDC/              # Pitch extractor model
│   │   └── PLBERT/           # PL-BERT model
│   └── models/
│       ├── core.py
│       ├── hifigan.py
│       └── diffusion/
├── examples/
│   ├── basic_synthesis.py
│   └── telugu_synthesis.py
├── setup.py
├── pyproject.toml
└── README.md
```

## API Reference

### Chiluka Class

```python
tts = Chiluka(
    config_path=None,       # Optional: custom config file
    checkpoint_path=None,   # Optional: custom checkpoint
    pretrained_dir=None,    # Optional: custom pretrained models
    device=None             # Optional: 'cuda' or 'cpu'
)
```

### synthesize()

```python
wav = tts.synthesize(
    text="Hello world",          # Text to synthesize
    reference_audio="ref.wav",   # Reference audio for style
    language="en",               # Language code ('en', 'te', 'hi', etc.)
    alpha=0.3,                   # Acoustic style mixing (0-1)
    beta=0.7,                    # Prosodic style mixing (0-1)
    diffusion_steps=5,           # Diffusion sampling steps
    embedding_scale=1.0,         # Classifier-free guidance scale
    sr=24000                     # Sample rate
)
```

### Other Methods

```python
# Save audio to file
tts.save_wav(wav, "output.wav", sr=24000)

# Play audio (requires pyaudio)
tts.play(wav, sr=24000)

# Get style embedding from audio
style = tts.compute_style("reference.wav", sr=24000)
```

## Synthesis Parameters

| Parameter | Default | Description |
|-----------|---------|-------------|
| `alpha` | 0.3 | Acoustic style mixing (0=reference only, 1=predicted only) |
| `beta` | 0.7 | Prosodic style mixing (0=reference only, 1=predicted only) |
| `diffusion_steps` | 5 | Number of diffusion sampling steps (more = better quality, slower) |
| `embedding_scale` | 1.0 | Classifier-free guidance scale |
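To make the table concrete, here is a minimal sketch of the mixing step as it appears in `chiluka/inference.py` (the random tensors below are stand-ins for the real embeddings):

```python
import torch

# ref_s is the 256-dim output of compute_style(): acoustic style in the first
# 128 dims, prosodic style in the last 128. s_pred is the diffusion-sampled
# style. alpha/beta = 0 keeps the reference style; 1 keeps the prediction.
alpha, beta = 0.3, 0.7
ref_s = torch.randn(1, 256)   # stand-in for compute_style(...)
s_pred = torch.randn(1, 256)  # stand-in for the diffusion sampler output

ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]  # acoustic style
s = beta * s_pred[:, 128:] + (1 - beta) * ref_s[:, 128:]      # prosodic style
```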
## Supported Languages

Uses [phonemizer](https://github.com/bootphon/phonemizer) with espeak-ng. Common languages:

| Language | Code |
|----------|------|
| English (US) | `en-us` |
| English (UK) | `en-gb` |
| Telugu | `te` |
| Hindi | `hi` |
| Tamil | `ta` |
| Kannada | `kn` |

See espeak-ng documentation for full list.
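Internally, `synthesize()` builds one espeak backend per language code and caches it (see `_get_phonemizer()` in `chiluka/inference.py`); a minimal standalone sketch of the same call:

```python
import phonemizer

# Same arguments Chiluka passes internally; requires espeak-ng on the system.
backend = phonemizer.backend.EspeakBackend(
    language="te", preserve_punctuation=True, with_stress=True
)
print(backend.phonemize(["నమస్కారం"]))  # list with one phoneme string
```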
## Requirements

- Python >= 3.8
- PyTorch >= 1.13.0
- CUDA (recommended for faster inference)
- espeak-ng

## Training Your Own Model

This package is for **inference only**. To train your own model, use the original [StyleTTS2](https://github.com/yl4579/StyleTTS2) repository.

After training, copy your checkpoint to `chiluka/checkpoints/` and update the config if needed, as sketched below.
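A sketch of that step; the source filename here is hypothetical, and `Chiluka()` simply loads the first `*.pth` it finds in `chiluka/checkpoints/`:

```bash
# Drop a finetuned StyleTTS2 checkpoint into the bundled checkpoints directory.
cp /path/to/your_finetuned_model.pth chiluka/checkpoints/
```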
+
## Credits
|
| 193 |
+
|
| 194 |
+
Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) by Yinghao Aaron Li et al.
|
| 195 |
+
|
| 196 |
+
## License
|
| 197 |
+
|
| 198 |
+
MIT License
|
chiluka/__init__.py
ADDED
@@ -0,0 +1,9 @@
"""
Chiluka - A lightweight TTS inference package based on StyleTTS2
"""

__version__ = "0.1.0"

from .inference import Chiluka

__all__ = ["Chiluka"]
chiluka/configs/config_ft.yml
ADDED
@@ -0,0 +1,116 @@
log_dir: "Models/tm_tel_ft_24k"
first_stage_path: "first_stage.pth"
save_freq: 2
log_interval: 10
device: "cuda"

epochs_1st: 30
epochs_2nd: 20

batch_size: 2  # Keep at 2 with filtering
max_len: 200   # This is fine - refers to audio frames, not phonemes

pretrained_model: "/home/purview/Documents/TextToSpeech_Backup/StyleTTS2/Models/LibriTTS/epochs_2nd_00020.pth"

second_stage_load_pretrained: true
load_only_params: true

F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/epoch_00080.pth"
PLBERT_dir: "Utils/PLBERT/"

data_params:
  train_data: "Data_custom/train_list.txt"
  val_data: "Data_custom/val_list.txt"
  root_path: "/home/purview/Documents/TextToSpeech_Backup/Processed_Dataset_24k/wavs"
  OOD_data: "Data_custom/OOD_texts.txt"
  min_length: 50  # <<<< This is in phonemes - keep it low

# Rest of your config stays the same...

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  # match the LibriTTS checkpoint setting (it was trained multispeaker:true)
  # You can still finetune with only speaker_id=0 in your train_list.txt
  multispeaker: true

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80
  n_token: 178
  max_dur: 50
  style_dim: 128
  dropout: 0.2

  # MUST MATCH LibriTTS CHECKPOINT (this is your main fix)
  decoder:
    type: "hifigan"
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    resblock_kernel_sizes: [3, 7, 11]
    upsample_initial_channel: 512
    upsample_rates: [10, 5, 3, 2]
    upsample_kernel_sizes: [20, 10, 6, 4]

  slm:
    model: "microsoft/wavlm-base-plus"
    sr: 16000
    hidden: 768
    nlayers: 13
    initial_channel: 64

  diffusion:
    embedding_mask_proba: 0.1
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2
    dist:
      sigma_data: 0.19926648961191362
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0
  lambda_slm: 1.0

  lambda_mono: 1.0
  lambda_s2s: 1.0
  TMA_epoch: 4

  lambda_F0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0
  lambda_sty: 1.0
  lambda_diff: 1.0

  # For a safe first run, delay diffusion + joint/SLM-adv.
  # After it runs, you can set these back to 0 like LibriTTS.
  diff_epoch: 999
  joint_epoch: 999

optimizer_params:
  lr: 0.0001
  bert_lr: 0.00001
  ft_lr: 0.00001

slmadv_params:
  min_len: 400
  max_len: 500
  batch_percentage: 0.5
  iter: 20
  thresh: 5
  scale: 0.01
  sig: 1.5
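For orientation, this is how `Chiluka.__init__` consumes the file (mirroring `chiluka/inference.py`): `yaml.safe_load` produces a plain dict, and `recursive_munch` wraps `model_params` so nested keys read as attributes:

```python
import yaml
from chiluka.utils import recursive_munch

with open("chiluka/configs/config_ft.yml") as f:
    config = yaml.safe_load(f)

# Munch-style attribute access over the nested dict.
params = recursive_munch(config["model_params"])
print(params.decoder.type, params.style_dim)  # hifigan 128
```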
chiluka/inference.py
ADDED
@@ -0,0 +1,368 @@
"""
Chiluka - Main inference API for TTS synthesis.

Example usage:
    from chiluka import Chiluka

    # Simple usage (uses bundled models)
    tts = Chiluka()

    # Generate speech
    wav = tts.synthesize(
        text="Hello, world!",
        reference_audio="path/to/reference.wav",
        language="en"
    )

    # Save to file
    tts.save_wav(wav, "output.wav")
"""

import os
import yaml
import torch
import torchaudio
import librosa
import numpy as np
from pathlib import Path
from typing import Optional, Union

from nltk.tokenize import word_tokenize

from .models import build_model, load_ASR_models, load_F0_models, load_plbert
from .models.diffusion import DiffusionSampler, ADPM2Sampler, KarrasSchedule
from .text_utils import TextCleaner
from .utils import recursive_munch, length_to_mask


# Get package directory
PACKAGE_DIR = Path(__file__).parent.absolute()
DEFAULT_PRETRAINED_DIR = PACKAGE_DIR / "pretrained"
DEFAULT_CONFIG_PATH = PACKAGE_DIR / "configs" / "config_ft.yml"
DEFAULT_CHECKPOINT_DIR = PACKAGE_DIR / "checkpoints"


def get_default_checkpoint():
    """Find the first checkpoint in the checkpoints directory."""
    if DEFAULT_CHECKPOINT_DIR.exists():
        checkpoints = list(DEFAULT_CHECKPOINT_DIR.glob("*.pth"))
        if checkpoints:
            return str(checkpoints[0])
    return None


class Chiluka:
    """
    Chiluka TTS - Text-to-Speech synthesis using StyleTTS2.

    Args:
        config_path: Path to the YAML config file. If None, uses bundled config.
        checkpoint_path: Path to the trained model checkpoint (.pth file). If None, uses bundled checkpoint.
        pretrained_dir: Directory containing pretrained sub-models (ASR/, JDC/, PLBERT/). If None, uses bundled models.
        device: Device to use ('cuda' or 'cpu'). If None, auto-detects.

    Example:
        # Use bundled models (simplest)
        tts = Chiluka()

        # Or specify custom paths
        tts = Chiluka(
            config_path="my_config.yml",
            checkpoint_path="my_model.pth"
        )
    """

    def __init__(
        self,
        config_path: Optional[str] = None,
        checkpoint_path: Optional[str] = None,
        pretrained_dir: Optional[str] = None,
        device: Optional[str] = None,
    ):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Resolve paths - use bundled defaults if not specified
        config_path = config_path or str(DEFAULT_CONFIG_PATH)
        checkpoint_path = checkpoint_path or get_default_checkpoint()
        pretrained_dir = pretrained_dir or str(DEFAULT_PRETRAINED_DIR)

        if not checkpoint_path:
            raise ValueError(
                "No checkpoint found. Please either:\n"
                "1. Place a .pth checkpoint in: {}\n"
                "2. Specify checkpoint_path parameter".format(DEFAULT_CHECKPOINT_DIR)
            )

        # Load config
        print(f"Loading config from {config_path}...")
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)

        # Resolve pretrained paths
        self.pretrained_dir = Path(pretrained_dir)
        asr_config = self.pretrained_dir / "ASR" / "config.yml"
        asr_path = self.pretrained_dir / "ASR" / "epoch_00080.pth"
        f0_path = self.pretrained_dir / "JDC" / "bst.t7"
        plbert_dir = self.pretrained_dir / "PLBERT"

        # Verify pretrained models exist
        self._verify_pretrained_models(asr_path, f0_path, plbert_dir)

        # Load pretrained models
        print("Loading ASR model...")
        self.text_aligner = load_ASR_models(str(asr_path), str(asr_config))

        print("Loading F0 model...")
        self.pitch_extractor = load_F0_models(str(f0_path))

        print("Loading PL-BERT...")
        self.plbert = load_plbert(str(plbert_dir))

        # Build model
        self.model_params = recursive_munch(self.config["model_params"])
        self.model = build_model(self.model_params, self.text_aligner, self.pitch_extractor, self.plbert)

        # Load checkpoint
        print(f"Loading checkpoint from {checkpoint_path}...")
        self._load_checkpoint(checkpoint_path)

        # Move to device and set to eval mode
        for key in self.model:
            self.model[key].eval().to(self.device)

        # Build sampler
        self.sampler = DiffusionSampler(
            self.model.diffusion.diffusion,
            sampler=ADPM2Sampler(),
            sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0),
            clamp=False,
        )

        # Text cleaner
        self.textcleaner = TextCleaner()

        # Mel spectrogram transform
        self.to_mel = torchaudio.transforms.MelSpectrogram(
            n_mels=80, n_fft=2048, win_length=1200, hop_length=300
        )

        # Cache for phonemizer backends
        self._phonemizers = {}

        print("✓ Chiluka TTS initialized successfully!")

    def _verify_pretrained_models(self, asr_path, f0_path, plbert_dir):
        """Verify all pretrained models exist."""
        missing = []
        if not asr_path.exists():
            missing.append(f"ASR model: {asr_path}")
        if not f0_path.exists():
            missing.append(f"F0 model: {f0_path}")
        if not plbert_dir.exists():
            missing.append(f"PLBERT directory: {plbert_dir}")

        if missing:
            raise FileNotFoundError(
                "Missing pretrained models:\n" +
                "\n".join(f"  - {m}" for m in missing) +
                f"\n\nExpected in: {self.pretrained_dir}"
            )

    def _load_checkpoint(self, checkpoint_path: str):
        """Load model weights from checkpoint."""
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        for key in self.model:
            if key in checkpoint["net"]:
                try:
                    self.model[key].load_state_dict(checkpoint["net"][key])
                except Exception:
                    state_dict = checkpoint["net"][key]
                    new_state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
                    self.model[key].load_state_dict(new_state_dict)

    def _get_phonemizer(self, language: str):
        """Get or create phonemizer backend for a language."""
        if language not in self._phonemizers:
            import phonemizer
            self._phonemizers[language] = phonemizer.backend.EspeakBackend(
                language=language, preserve_punctuation=True, with_stress=True
            )
        return self._phonemizers[language]

    def _preprocess_mel(self, wave: np.ndarray, mean: float = -4, std: float = 4) -> torch.Tensor:
        """Convert waveform to normalized mel spectrogram."""
        wave_tensor = torch.from_numpy(wave).float()
        mel_tensor = self.to_mel(wave_tensor)
        mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
        return mel_tensor

    def compute_style(self, audio_path: str, sr: int = 24000) -> torch.Tensor:
        """
        Compute style embedding from reference audio.

        Args:
            audio_path: Path to reference audio file
            sr: Target sample rate

        Returns:
            Style embedding tensor
        """
        wave, orig_sr = librosa.load(audio_path, sr=sr)
        audio, _ = librosa.effects.trim(wave, top_db=30)
        if orig_sr != sr:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=sr)

        mel_tensor = self._preprocess_mel(audio).to(self.device)

        with torch.no_grad():
            ref_s = self.model.style_encoder(mel_tensor.unsqueeze(1))
            ref_p = self.model.predictor_encoder(mel_tensor.unsqueeze(1))

        return torch.cat([ref_s, ref_p], dim=1)

    def synthesize(
        self,
        text: str,
        reference_audio: str,
        language: str = "en",
        alpha: float = 0.3,
        beta: float = 0.7,
        diffusion_steps: int = 5,
        embedding_scale: float = 1.0,
        sr: int = 24000,
    ) -> np.ndarray:
        """
        Synthesize speech from text.

        Args:
            text: Input text to synthesize
            reference_audio: Path to reference audio for style transfer
            language: Language code for phonemization (e.g., 'en', 'te', 'hi')
            alpha: Style mixing coefficient for acoustic features (0-1)
            beta: Style mixing coefficient for prosodic features (0-1)
            diffusion_steps: Number of diffusion sampling steps
            embedding_scale: Classifier-free guidance scale
            sr: Sample rate

        Returns:
            Generated audio waveform as numpy array
        """
        # Compute style from reference
        ref_s = self.compute_style(reference_audio, sr=sr)

        # Phonemize text
        phonemizer = self._get_phonemizer(language)
        text = text.strip()
        ps = phonemizer.phonemize([text])
        ps = word_tokenize(ps[0])
        ps = " ".join(ps)

        # Convert to tokens
        tokens = self.textcleaner(ps)
        tokens.insert(0, 0)  # Add start token
        tokens = torch.LongTensor(tokens).to(self.device).unsqueeze(0)

        # Truncate if too long
        max_len = self.model.bert.config.max_position_embeddings
        if tokens.shape[-1] > max_len:
            tokens = tokens[:, :max_len]

        with torch.no_grad():
            input_lengths = torch.LongTensor([tokens.shape[-1]]).to(self.device)
            text_mask = length_to_mask(input_lengths).to(self.device)

            # Encode text
            t_en = self.model.text_encoder(tokens, input_lengths, text_mask)
            bert_dur = self.model.bert(tokens, attention_mask=(~text_mask).int())
            d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)

            # Sample style
            s_pred = self.sampler(
                noise=torch.randn((1, 256)).unsqueeze(1).to(self.device),
                embedding=bert_dur,
                embedding_scale=embedding_scale,
                features=ref_s,
                num_steps=diffusion_steps,
            ).squeeze(1)

            s = s_pred[:, 128:]
            ref = s_pred[:, :128]

            # Mix styles
            ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
            s = beta * s + (1 - beta) * ref_s[:, 128:]

            # Predict duration
            d = self.model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
            x, _ = self.model.predictor.lstm(d)
            duration = self.model.predictor.duration_proj(x)
            duration = torch.sigmoid(duration).sum(axis=-1)
            pred_dur = torch.round(duration.squeeze()).clamp(min=1)

            # Build alignment
            pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
            c_frame = 0
            for i in range(pred_aln_trg.size(0)):
                pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
                c_frame += int(pred_dur[i].data)

            en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(self.device))

            # Adjust for hifigan decoder
            if self.model_params.decoder.type == "hifigan":
                asr_new = torch.zeros_like(en)
                asr_new[:, :, 0] = en[:, :, 0]
                asr_new[:, :, 1:] = en[:, :, 0:-1]
                en = asr_new

            # Predict F0 and energy
            F0_pred, N_pred = self.model.predictor.F0Ntrain(en, s)

            # Encode for decoder
            asr = (t_en @ pred_aln_trg.unsqueeze(0).to(self.device))
            if self.model_params.decoder.type == "hifigan":
                asr_new = torch.zeros_like(asr)
                asr_new[:, :, 0] = asr[:, :, 0]
                asr_new[:, :, 1:] = asr[:, :, 0:-1]
                asr = asr_new

            # Decode waveform
            out = self.model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

        return out.squeeze().cpu().numpy()[..., :-50]

    def save_wav(self, wav: np.ndarray, path: str, sr: int = 24000):
        """
        Save waveform to WAV file.

        Args:
            wav: Audio waveform as numpy array
            path: Output file path
            sr: Sample rate
        """
        import scipy.io.wavfile as wavfile
        wav_int16 = (wav * 32767).clip(-32768, 32767).astype(np.int16)
        wavfile.write(path, sr, wav_int16)
        print(f"Saved audio to {path}")

    def play(self, wav: np.ndarray, sr: int = 24000):
        """
        Play audio through speakers (requires pyaudio).

        Args:
            wav: Audio waveform as numpy array
            sr: Sample rate
        """
        try:
            import pyaudio
        except ImportError:
            raise ImportError("pyaudio is required for playback. Install with: pip install pyaudio")

        audio_int16 = (wav * 32767.0).clip(-32768, 32767).astype("int16").tobytes()
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=sr, output=True)
        stream.write(audio_int16)
        stream.stop_stream()
        stream.close()
        p.terminate()
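A short usage sketch of the knobs exposed by `synthesize()` above (file paths are placeholders; see the Synthesis Parameters table in the README):

```python
from chiluka import Chiluka

tts = Chiluka()

# Fast draft: few diffusion sampling steps.
draft = tts.synthesize("Quick draft.", reference_audio="ref.wav",
                       diffusion_steps=3)

# Higher quality and stronger text conditioning: more steps, larger guidance.
final = tts.synthesize("Final render.", reference_audio="ref.wav",
                       diffusion_steps=10, embedding_scale=1.5)
tts.save_wav(final, "final.wav")
```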
chiluka/models/__init__.py
ADDED
@@ -0,0 +1,21 @@
"""Model components for Chiluka TTS."""

from .core import (
    build_model,
    load_ASR_models,
    load_F0_models,
    load_plbert,
    StyleEncoder,
    TextEncoder,
    ProsodyPredictor,
)

__all__ = [
    "build_model",
    "load_ASR_models",
    "load_F0_models",
    "load_plbert",
    "StyleEncoder",
    "TextEncoder",
    "ProsodyPredictor",
]
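A sketch of wiring these exports together by hand, following what `Chiluka.__init__` does in `chiluka/inference.py` (the paths are the bundled ones):

```python
import yaml
from chiluka.models import build_model, load_ASR_models, load_F0_models, load_plbert
from chiluka.utils import recursive_munch

config = yaml.safe_load(open("chiluka/configs/config_ft.yml"))

# Load the three frozen helper models, then assemble the full model dict.
text_aligner = load_ASR_models("chiluka/pretrained/ASR/epoch_00080.pth",
                               "chiluka/pretrained/ASR/config.yml")
pitch_extractor = load_F0_models("chiluka/pretrained/JDC/bst.t7")
plbert = load_plbert("chiluka/pretrained/PLBERT")

model = build_model(recursive_munch(config["model_params"]),
                    text_aligner, pitch_extractor, plbert)
```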
chiluka/models/core.py
ADDED
|
@@ -0,0 +1,731 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core model definitions for Chiluka TTS."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import math
|
| 5 |
+
import yaml
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from torch.nn.utils import weight_norm, spectral_norm
|
| 10 |
+
from collections import OrderedDict
|
| 11 |
+
from munch import Munch
|
| 12 |
+
|
| 13 |
+
from transformers import AlbertConfig, AlbertModel
|
| 14 |
+
|
| 15 |
+
from .diffusion.sampler import KDiffusion, LogNormalDistribution
|
| 16 |
+
from .diffusion.modules import Transformer1d, StyleTransformer1d
|
| 17 |
+
from .diffusion.diffusion import AudioDiffusionConditional
|
| 18 |
+
from .hifigan import Decoder
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ============== Style Encoder ==============
|
| 22 |
+
|
| 23 |
+
class DownSample(nn.Module):
|
| 24 |
+
def __init__(self, layer_type):
|
| 25 |
+
super().__init__()
|
| 26 |
+
self.layer_type = layer_type
|
| 27 |
+
|
| 28 |
+
def forward(self, x):
|
| 29 |
+
if self.layer_type == 'none':
|
| 30 |
+
return x
|
| 31 |
+
elif self.layer_type == 'timepreserve':
|
| 32 |
+
return F.avg_pool2d(x, (2, 1))
|
| 33 |
+
elif self.layer_type == 'half':
|
| 34 |
+
if x.shape[-1] % 2 != 0:
|
| 35 |
+
x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
|
| 36 |
+
return F.avg_pool2d(x, 2)
|
| 37 |
+
else:
|
| 38 |
+
raise RuntimeError(f'Unexpected downsample type {self.layer_type}')
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class LearnedDownSample(nn.Module):
|
| 42 |
+
def __init__(self, layer_type, dim_in):
|
| 43 |
+
super().__init__()
|
| 44 |
+
self.layer_type = layer_type
|
| 45 |
+
if self.layer_type == 'none':
|
| 46 |
+
self.conv = nn.Identity()
|
| 47 |
+
elif self.layer_type == 'timepreserve':
|
| 48 |
+
self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
|
| 49 |
+
elif self.layer_type == 'half':
|
| 50 |
+
self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
|
| 51 |
+
else:
|
| 52 |
+
raise RuntimeError(f'Unexpected downsample type {self.layer_type}')
|
| 53 |
+
|
| 54 |
+
def forward(self, x):
|
| 55 |
+
return self.conv(x)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class ResBlk(nn.Module):
|
| 59 |
+
def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2), normalize=False, downsample='none'):
|
| 60 |
+
super().__init__()
|
| 61 |
+
self.actv = actv
|
| 62 |
+
self.normalize = normalize
|
| 63 |
+
self.downsample = DownSample(downsample)
|
| 64 |
+
self.downsample_res = LearnedDownSample(downsample, dim_in)
|
| 65 |
+
self.learned_sc = dim_in != dim_out
|
| 66 |
+
self._build_weights(dim_in, dim_out)
|
| 67 |
+
|
| 68 |
+
def _build_weights(self, dim_in, dim_out):
|
| 69 |
+
self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
|
| 70 |
+
self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
|
| 71 |
+
if self.normalize:
|
| 72 |
+
self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
|
| 73 |
+
self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
|
| 74 |
+
if self.learned_sc:
|
| 75 |
+
self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
|
| 76 |
+
|
| 77 |
+
def _shortcut(self, x):
|
| 78 |
+
if self.learned_sc:
|
| 79 |
+
x = self.conv1x1(x)
|
| 80 |
+
if self.downsample:
|
| 81 |
+
x = self.downsample(x)
|
| 82 |
+
return x
|
| 83 |
+
|
| 84 |
+
def _residual(self, x):
|
| 85 |
+
if self.normalize:
|
| 86 |
+
x = self.norm1(x)
|
| 87 |
+
x = self.actv(x)
|
| 88 |
+
x = self.conv1(x)
|
| 89 |
+
x = self.downsample_res(x)
|
| 90 |
+
if self.normalize:
|
| 91 |
+
x = self.norm2(x)
|
| 92 |
+
x = self.actv(x)
|
| 93 |
+
x = self.conv2(x)
|
| 94 |
+
return x
|
| 95 |
+
|
| 96 |
+
def forward(self, x):
|
| 97 |
+
x = self._shortcut(x) + self._residual(x)
|
| 98 |
+
return x / math.sqrt(2)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class StyleEncoder(nn.Module):
|
| 102 |
+
def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
|
| 103 |
+
super().__init__()
|
| 104 |
+
blocks = []
|
| 105 |
+
blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
|
| 106 |
+
repeat_num = 4
|
| 107 |
+
for _ in range(repeat_num):
|
| 108 |
+
dim_out = min(dim_in * 2, max_conv_dim)
|
| 109 |
+
blocks += [ResBlk(dim_in, dim_out, downsample='half')]
|
| 110 |
+
dim_in = dim_out
|
| 111 |
+
blocks += [nn.LeakyReLU(0.2)]
|
| 112 |
+
blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
|
| 113 |
+
blocks += [nn.AdaptiveAvgPool2d(1)]
|
| 114 |
+
blocks += [nn.LeakyReLU(0.2)]
|
| 115 |
+
self.shared = nn.Sequential(*blocks)
|
| 116 |
+
self.unshared = nn.Linear(dim_out, style_dim)
|
| 117 |
+
|
| 118 |
+
def forward(self, x):
|
| 119 |
+
h = self.shared(x)
|
| 120 |
+
h = h.view(h.size(0), -1)
|
| 121 |
+
s = self.unshared(h)
|
| 122 |
+
return s
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ============== Text Encoder ==============
|
| 126 |
+
|
| 127 |
+
class LayerNorm(nn.Module):
|
| 128 |
+
def __init__(self, channels, eps=1e-5):
|
| 129 |
+
super().__init__()
|
| 130 |
+
self.channels = channels
|
| 131 |
+
self.eps = eps
|
| 132 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
| 133 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
| 134 |
+
|
| 135 |
+
def forward(self, x):
|
| 136 |
+
x = x.transpose(1, -1)
|
| 137 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
| 138 |
+
return x.transpose(1, -1)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
class LinearNorm(nn.Module):
|
| 142 |
+
def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
|
| 143 |
+
super().__init__()
|
| 144 |
+
self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
|
| 145 |
+
nn.init.xavier_uniform_(self.linear_layer.weight, gain=nn.init.calculate_gain(w_init_gain))
|
| 146 |
+
|
| 147 |
+
def forward(self, x):
|
| 148 |
+
return self.linear_layer(x)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class TextEncoder(nn.Module):
|
| 152 |
+
def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
|
| 153 |
+
super().__init__()
|
| 154 |
+
self.embedding = nn.Embedding(n_symbols, channels)
|
| 155 |
+
padding = (kernel_size - 1) // 2
|
| 156 |
+
self.cnn = nn.ModuleList()
|
| 157 |
+
for _ in range(depth):
|
| 158 |
+
self.cnn.append(nn.Sequential(
|
| 159 |
+
weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
|
| 160 |
+
LayerNorm(channels),
|
| 161 |
+
actv,
|
| 162 |
+
nn.Dropout(0.2),
|
| 163 |
+
))
|
| 164 |
+
self.lstm = nn.LSTM(channels, channels // 2, 1, batch_first=True, bidirectional=True)
|
| 165 |
+
|
| 166 |
+
def forward(self, x, input_lengths, m):
|
| 167 |
+
x = self.embedding(x)
|
| 168 |
+
x = x.transpose(1, 2)
|
| 169 |
+
m = m.to(input_lengths.device).unsqueeze(1)
|
| 170 |
+
x.masked_fill_(m, 0.0)
|
| 171 |
+
for c in self.cnn:
|
| 172 |
+
x = c(x)
|
| 173 |
+
x.masked_fill_(m, 0.0)
|
| 174 |
+
x = x.transpose(1, 2)
|
| 175 |
+
input_lengths = input_lengths.cpu().numpy()
|
| 176 |
+
x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
|
| 177 |
+
self.lstm.flatten_parameters()
|
| 178 |
+
x, _ = self.lstm(x)
|
| 179 |
+
x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
|
| 180 |
+
x = x.transpose(-1, -2)
|
| 181 |
+
x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
|
| 182 |
+
x_pad[:, :, :x.shape[-1]] = x
|
| 183 |
+
x = x_pad.to(x.device)
|
| 184 |
+
x.masked_fill_(m, 0.0)
|
| 185 |
+
return x
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ============== Prosody Predictor ==============
|
| 189 |
+
|
| 190 |
+
class AdaIN1d(nn.Module):
|
| 191 |
+
def __init__(self, style_dim, num_features):
|
| 192 |
+
super().__init__()
|
| 193 |
+
self.norm = nn.InstanceNorm1d(num_features, affine=False)
|
| 194 |
+
self.fc = nn.Linear(style_dim, num_features * 2)
|
| 195 |
+
|
| 196 |
+
def forward(self, x, s):
|
| 197 |
+
h = self.fc(s)
|
| 198 |
+
h = h.view(h.size(0), h.size(1), 1)
|
| 199 |
+
gamma, beta = torch.chunk(h, chunks=2, dim=1)
|
| 200 |
+
return (1 + gamma) * self.norm(x) + beta
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
class UpSample1d(nn.Module):
|
| 204 |
+
def __init__(self, layer_type):
|
| 205 |
+
super().__init__()
|
| 206 |
+
self.layer_type = layer_type
|
| 207 |
+
|
| 208 |
+
def forward(self, x):
|
| 209 |
+
if self.layer_type == 'none':
|
| 210 |
+
return x
|
| 211 |
+
else:
|
| 212 |
+
return F.interpolate(x, scale_factor=2, mode='nearest')
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
class AdainResBlk1d(nn.Module):
|
| 216 |
+
def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), upsample='none', dropout_p=0.0):
|
| 217 |
+
super().__init__()
|
| 218 |
+
self.actv = actv
|
| 219 |
+
self.upsample_type = upsample
|
| 220 |
+
self.upsample = UpSample1d(upsample)
|
| 221 |
+
self.learned_sc = dim_in != dim_out
|
| 222 |
+
self._build_weights(dim_in, dim_out, style_dim)
|
| 223 |
+
self.dropout = nn.Dropout(dropout_p)
|
| 224 |
+
if upsample == 'none':
|
| 225 |
+
self.pool = nn.Identity()
|
| 226 |
+
else:
|
| 227 |
+
self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
|
| 228 |
+
|
| 229 |
+
def _build_weights(self, dim_in, dim_out, style_dim):
|
| 230 |
+
self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
|
| 231 |
+
self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
|
| 232 |
+
self.norm1 = AdaIN1d(style_dim, dim_in)
|
| 233 |
+
self.norm2 = AdaIN1d(style_dim, dim_out)
|
| 234 |
+
if self.learned_sc:
|
| 235 |
+
self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
|
| 236 |
+
|
| 237 |
+
def _shortcut(self, x):
|
| 238 |
+
x = self.upsample(x)
|
| 239 |
+
if self.learned_sc:
|
| 240 |
+
x = self.conv1x1(x)
|
| 241 |
+
return x
|
| 242 |
+
|
| 243 |
+
def _residual(self, x, s):
|
| 244 |
+
x = self.norm1(x, s)
|
| 245 |
+
x = self.actv(x)
|
| 246 |
+
x = self.pool(x)
|
| 247 |
+
x = self.conv1(self.dropout(x))
|
| 248 |
+
x = self.norm2(x, s)
|
| 249 |
+
x = self.actv(x)
|
| 250 |
+
x = self.conv2(self.dropout(x))
|
| 251 |
+
return x
|
| 252 |
+
|
| 253 |
+
def forward(self, x, s):
|
| 254 |
+
out = self._residual(x, s)
|
| 255 |
+
out = (out + self._shortcut(x)) / math.sqrt(2)
|
| 256 |
+
return out
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class AdaLayerNorm(nn.Module):
|
| 260 |
+
def __init__(self, style_dim, channels, eps=1e-5):
|
| 261 |
+
super().__init__()
|
| 262 |
+
self.channels = channels
|
| 263 |
+
self.eps = eps
|
| 264 |
+
self.fc = nn.Linear(style_dim, channels * 2)
|
| 265 |
+
|
| 266 |
+
def forward(self, x, s):
|
| 267 |
+
x = x.transpose(-1, -2)
|
| 268 |
+
x = x.transpose(1, -1)
|
| 269 |
+
h = self.fc(s)
|
| 270 |
+
h = h.view(h.size(0), h.size(1), 1)
|
| 271 |
+
gamma, beta = torch.chunk(h, chunks=2, dim=1)
|
| 272 |
+
gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
|
| 273 |
+
x = F.layer_norm(x, (self.channels,), eps=self.eps)
|
| 274 |
+
x = (1 + gamma) * x + beta
|
| 275 |
+
return x.transpose(1, -1).transpose(-1, -2)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
class DurationEncoder(nn.Module):
|
| 279 |
+
def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
|
| 280 |
+
super().__init__()
|
| 281 |
+
self.lstms = nn.ModuleList()
|
| 282 |
+
for _ in range(nlayers):
|
| 283 |
+
self.lstms.append(nn.LSTM(d_model + sty_dim, d_model // 2, num_layers=1, batch_first=True, bidirectional=True, dropout=dropout))
|
| 284 |
+
self.lstms.append(AdaLayerNorm(sty_dim, d_model))
|
| 285 |
+
self.dropout = dropout
|
| 286 |
+
self.d_model = d_model
|
| 287 |
+
self.sty_dim = sty_dim
|
| 288 |
+
|
| 289 |
+
def forward(self, x, style, text_lengths, m):
|
| 290 |
+
masks = m.to(text_lengths.device)
|
| 291 |
+
x = x.permute(2, 0, 1)
|
| 292 |
+
s = style.expand(x.shape[0], x.shape[1], -1)
|
| 293 |
+
x = torch.cat([x, s], axis=-1)
|
| 294 |
+
x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
|
| 295 |
+
x = x.transpose(0, 1)
|
| 296 |
+
input_lengths = text_lengths.cpu().numpy()
|
| 297 |
+
x = x.transpose(-1, -2)
|
| 298 |
+
for block in self.lstms:
|
| 299 |
+
if isinstance(block, AdaLayerNorm):
|
| 300 |
+
x = block(x.transpose(-1, -2), style).transpose(-1, -2)
|
| 301 |
+
x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
|
| 302 |
+
x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
|
| 303 |
+
else:
|
| 304 |
+
x = x.transpose(-1, -2)
|
| 305 |
+
x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
|
| 306 |
+
block.flatten_parameters()
|
| 307 |
+
x, _ = block(x)
|
| 308 |
+
x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
|
| 309 |
+
x = F.dropout(x, p=self.dropout, training=self.training)
|
| 310 |
+
x = x.transpose(-1, -2)
|
| 311 |
+
x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
|
| 312 |
+
x_pad[:, :, :x.shape[-1]] = x
|
| 313 |
+
x = x_pad.to(x.device)
|
| 314 |
+
return x.transpose(-1, -2)
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
class ProsodyPredictor(nn.Module):
|
| 318 |
+
def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
|
| 319 |
+
super().__init__()
|
| 320 |
+
self.text_encoder = DurationEncoder(sty_dim=style_dim, d_model=d_hid, nlayers=nlayers, dropout=dropout)
|
| 321 |
+
self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
|
| 322 |
+
self.duration_proj = LinearNorm(d_hid, max_dur)
|
| 323 |
+
self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
|
| 324 |
+
self.F0 = nn.ModuleList()
|
| 325 |
+
self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
|
| 326 |
+
self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
|
| 327 |
+
self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
|
| 328 |
+
self.N = nn.ModuleList()
|
| 329 |
+
self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
|
| 330 |
+
self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
|
| 331 |
+
self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
|
| 332 |
+
self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
|
| 333 |
+
self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
|
| 334 |
+
|
| 335 |
+
def forward(self, texts, style, text_lengths, alignment, m):
|
| 336 |
+
d = self.text_encoder(texts, style, text_lengths, m)
|
| 337 |
+
input_lengths = text_lengths.cpu().numpy()
|
| 338 |
+
x = nn.utils.rnn.pack_padded_sequence(d, input_lengths, batch_first=True, enforce_sorted=False)
|
| 339 |
+
m = m.to(text_lengths.device).unsqueeze(1)
|
| 340 |
+
self.lstm.flatten_parameters()
|
| 341 |
+
x, _ = self.lstm(x)
|
| 342 |
+
x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
|
| 343 |
+
x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
|
| 344 |
+
x_pad[:, :x.shape[1], :] = x
|
| 345 |
+
x = x_pad.to(x.device)
|
| 346 |
+
duration = self.duration_proj(F.dropout(x, 0.5, training=self.training))
|
| 347 |
+
en = (d.transpose(-1, -2) @ alignment)
|
| 348 |
+
return duration.squeeze(-1), en
|
| 349 |
+
|
| 350 |
+
def F0Ntrain(self, x, s):
|
| 351 |
+
x, _ = self.shared(x.transpose(-1, -2))
|
| 352 |
+
F0 = x.transpose(-1, -2)
|
| 353 |
+
for block in self.F0:
|
| 354 |
+
F0 = block(F0, s)
|
| 355 |
+
F0 = self.F0_proj(F0)
|
| 356 |
+
N = x.transpose(-1, -2)
|
| 357 |
+
for block in self.N:
|
| 358 |
+
N = block(N, s)
|
| 359 |
+
N = self.N_proj(N)
|
| 360 |
+
return F0.squeeze(1), N.squeeze(1)
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
# ============== Pretrained Model Loaders ==============
|
| 364 |
+
|
| 365 |
+
class CustomAlbert(AlbertModel):
|
| 366 |
+
def forward(self, *args, **kwargs):
|
| 367 |
+
outputs = super().forward(*args, **kwargs)
|
| 368 |
+
return outputs.last_hidden_state
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def load_plbert(log_dir):
|
| 372 |
+
"""Load PL-BERT model from directory."""
|
| 373 |
+
config_path = os.path.join(log_dir, "config.yml")
|
| 374 |
+
plbert_config = yaml.safe_load(open(config_path))
|
| 375 |
+
albert_base_configuration = AlbertConfig(**plbert_config['model_params'])
|
| 376 |
+
bert = CustomAlbert(albert_base_configuration)
|
| 377 |
+
files = os.listdir(log_dir)
|
| 378 |
+
ckpts = [f for f in files if f.startswith("step_")]
|
| 379 |
+
iters = [int(f.split('_')[-1].split('.')[0]) for f in ckpts if os.path.isfile(os.path.join(log_dir, f))]
|
| 380 |
+
iters = sorted(iters)[-1]
|
| 381 |
+
checkpoint = torch.load(os.path.join(log_dir, f"step_{iters}.t7"), map_location='cpu')
|
| 382 |
+
state_dict = checkpoint['net']
|
| 383 |
+
new_state_dict = OrderedDict()
|
| 384 |
+
for k, v in state_dict.items():
|
| 385 |
+
name = k[7:] # remove `module.`
|
| 386 |
+
if name.startswith('encoder.'):
|
| 387 |
+
name = name[8:] # remove `encoder.`
|
| 388 |
+
new_state_dict[name] = v
|
| 389 |
+
if "embeddings.position_ids" in new_state_dict:
|
| 390 |
+
del new_state_dict["embeddings.position_ids"]
|
| 391 |
+
bert.load_state_dict(new_state_dict, strict=False)
|
| 392 |
+
return bert
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
# ASR model components
|
| 396 |
+
import torchaudio
|
| 397 |
+
import torchaudio.functional as audio_F
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
class MFCC(nn.Module):
|
| 401 |
+
def __init__(self, n_mfcc=40, n_mels=80):
|
| 402 |
+
super().__init__()
|
| 403 |
+
self.n_mfcc = n_mfcc
|
| 404 |
+
self.n_mels = n_mels
|
| 405 |
+
self.norm = 'ortho'
|
| 406 |
+
dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
|
| 407 |
+
self.register_buffer('dct_mat', dct_mat)
|
| 408 |
+
|
| 409 |
+
def forward(self, mel_specgram):
|
| 410 |
+
if len(mel_specgram.shape) == 2:
|
| 411 |
+
mel_specgram = mel_specgram.unsqueeze(0)
|
| 412 |
+
unsqueezed = True
|
| 413 |
+
else:
|
| 414 |
+
unsqueezed = False
|
| 415 |
+
mfcc = torch.matmul(mel_specgram.transpose(1, 2), self.dct_mat).transpose(1, 2)
|
| 416 |
+
if unsqueezed:
|
| 417 |
+
mfcc = mfcc.squeeze(0)
|
| 418 |
+
return mfcc
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
class ConvNorm(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super().__init__()
        if padding is None:
            padding = int(dilation * (kernel_size - 1) / 2)
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias)
        nn.init.xavier_uniform_(self.conv.weight, gain=nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        return self.conv(signal)


class ConvBlock(nn.Module):
    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='relu'):
        super().__init__()
        self._n_groups = 8
        self.blocks = nn.ModuleList([self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p) for i in range(n_conv)])

    def forward(self, x):
        for block in self.blocks:
            res = x
            x = block(x)
            x += res
        return x

    def _get_conv(self, hidden_dim, dilation, activ='relu', dropout_p=0.2):
        layers = [
            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
            nn.ReLU() if activ == 'relu' else nn.LeakyReLU(0.2),
            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
            nn.Dropout(p=dropout_p),
            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
            nn.ReLU() if activ == 'relu' else nn.LeakyReLU(0.2),
            nn.Dropout(p=dropout_p)
        ]
        return nn.Sequential(*layers)

class LocationLayer(nn.Module):
    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
        super().__init__()
        padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(2, attention_n_filters, kernel_size=attention_kernel_size, padding=padding, bias=False, stride=1, dilation=1)
        self.location_dense = LinearNorm(attention_n_filters, attention_dim, bias=False, w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention


class Attention(nn.Module):
    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, attention_location_n_filters, attention_location_kernel_size):
        super().__init__()
        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, bias=False, w_init_gain='tanh')
        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, w_init_gain='tanh')
        self.v = LinearNorm(attention_dim, 1, bias=False)
        self.location_layer = LocationLayer(attention_location_n_filters, attention_location_kernel_size, attention_dim)
        self.score_mask_value = -float("inf")

    def forward(self, attention_hidden_state, memory, processed_memory, attention_weights_cat, mask):
        processed_query = self.query_layer(attention_hidden_state.unsqueeze(1))
        processed_attention = self.location_layer(attention_weights_cat)
        energies = self.v(torch.tanh(processed_query + processed_attention + processed_memory))
        energies = energies.squeeze(-1)
        if mask is not None:
            energies.data.masked_fill_(mask, self.score_mask_value)
        attention_weights = F.softmax(energies, dim=1)
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)
        return attention_context, attention_weights

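# The scoring rule above matches the location-sensitive attention of
# Tacotron 2: energies = v . tanh(W*query + U*location_features + V*memory),
# where the location features are convolved from the previous and cumulative
# attention weights stacked along the channel axis.
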
class ASRS2S(nn.Module):
    def __init__(self, embedding_dim=256, hidden_dim=512, n_location_filters=32, location_kernel_size=63, n_token=40):
        super().__init__()
        self.embedding = nn.Embedding(n_token, embedding_dim)
        val_range = math.sqrt(6 / hidden_dim)
        self.embedding.weight.data.uniform_(-val_range, val_range)
        self.decoder_rnn_dim = hidden_dim
        self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
        self.attention_layer = Attention(self.decoder_rnn_dim, hidden_dim, hidden_dim, n_location_filters, location_kernel_size)
        self.decoder_rnn = nn.LSTMCell(self.decoder_rnn_dim + embedding_dim, self.decoder_rnn_dim)
        self.project_to_hidden = nn.Sequential(LinearNorm(self.decoder_rnn_dim * 2, hidden_dim), nn.Tanh())
        self.sos = 1
        self.eos = 2
        self.unk_index = 3
        self.random_mask = 0.1

    def initialize_decoder_states(self, memory, mask):
        B, L, H = memory.shape
        self.decoder_hidden = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
        self.decoder_cell = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
        self.attention_weights = torch.zeros((B, L)).type_as(memory)
        self.attention_weights_cum = torch.zeros((B, L)).type_as(memory)
        self.attention_context = torch.zeros((B, H)).type_as(memory)
        self.memory = memory
        self.processed_memory = self.attention_layer.memory_layer(memory)
        self.mask = mask

    def forward(self, memory, memory_mask, text_input):
        self.initialize_decoder_states(memory, memory_mask)
        # Randomly replace 10% of the input tokens with <unk> for robustness.
        random_mask = (torch.rand(text_input.shape) < self.random_mask).to(text_input.device)
        _text_input = text_input.clone()
        _text_input.masked_fill_(random_mask, self.unk_index)
        decoder_inputs = self.embedding(_text_input).transpose(0, 1)
        start_embedding = self.embedding(torch.LongTensor([self.sos] * decoder_inputs.size(1)).to(decoder_inputs.device))
        decoder_inputs = torch.cat((start_embedding.unsqueeze(0), decoder_inputs), dim=0)
        hidden_outputs, logit_outputs, alignments = [], [], []
        while len(hidden_outputs) < decoder_inputs.size(0):
            decoder_input = decoder_inputs[len(hidden_outputs)]
            hidden, logit, attention_weights = self.decode(decoder_input)
            hidden_outputs += [hidden]
            logit_outputs += [logit]
            alignments += [attention_weights]
        hidden_outputs = torch.stack(hidden_outputs).transpose(0, 1).contiguous()
        logit_outputs = torch.stack(logit_outputs).transpose(0, 1).contiguous()
        alignments = torch.stack(alignments).transpose(0, 1)
        return hidden_outputs, logit_outputs, alignments

    def decode(self, decoder_input):
        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(cell_input, (self.decoder_hidden, self.decoder_cell))
        attention_weights_cat = torch.cat((self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1)
        self.attention_context, self.attention_weights = self.attention_layer(self.decoder_hidden, self.memory, self.processed_memory, attention_weights_cat, self.mask)
        self.attention_weights_cum += self.attention_weights
        hidden_and_context = torch.cat((self.decoder_hidden, self.attention_context), -1)
        hidden = self.project_to_hidden(hidden_and_context)
        logit = self.project_to_n_symbols(F.dropout(hidden, 0.5, self.training))
        return hidden, logit, self.attention_weights

class ASRCNN(nn.Module):
    def __init__(self, input_dim=80, hidden_dim=256, n_token=35, n_layers=6, token_embedding_dim=256):
        super().__init__()
        self.n_token = n_token
        self.n_down = 1
        self.to_mfcc = MFCC()
        self.init_cnn = ConvNorm(input_dim // 2, hidden_dim, kernel_size=7, padding=3, stride=2)
        self.cnns = nn.Sequential(*[nn.Sequential(ConvBlock(hidden_dim), nn.GroupNorm(num_groups=1, num_channels=hidden_dim)) for _ in range(n_layers)])
        self.projection = ConvNorm(hidden_dim, hidden_dim // 2)
        self.ctc_linear = nn.Sequential(LinearNorm(hidden_dim // 2, hidden_dim), nn.ReLU(), LinearNorm(hidden_dim, n_token))
        self.asr_s2s = ASRS2S(embedding_dim=token_embedding_dim, hidden_dim=hidden_dim // 2, n_token=n_token)

    def forward(self, x, src_key_padding_mask=None, text_input=None):
        x = self.to_mfcc(x)
        x = self.init_cnn(x)
        x = self.cnns(x)
        x = self.projection(x)
        x = x.transpose(1, 2)
        ctc_logit = self.ctc_linear(x)
        if text_input is not None:
            _, s2s_logit, s2s_attn = self.asr_s2s(x, src_key_padding_mask, text_input)
            return ctc_logit, s2s_logit, s2s_attn
        else:
            return ctc_logit

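# Shape sketch for ASRCNN.forward (hypothetical sizes): to_mfcc keeps the
# frame count and init_cnn roughly halves it with stride 2, so a (B, 80, T)
# mel input yields ctc_logit of shape (B, ~T/2, n_token); passing text_input
# additionally returns the seq2seq logits and attention alignments.
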
def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
    """Load ASR model."""
    with open(ASR_MODEL_CONFIG) as f:
        config = yaml.safe_load(f)
    model_config = config['model_params']
    model = ASRCNN(**model_config)
    try:
        ckpt = torch.load(ASR_MODEL_PATH, map_location="cpu", weights_only=False)
    except TypeError:
        # Older PyTorch versions do not accept the `weights_only` argument.
        ckpt = torch.load(ASR_MODEL_PATH, map_location="cpu")
    params = ckpt["model"]
    model.load_state_dict(params)
    return model

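# Usage sketch (paths are illustrative): the YAML config supplies the
# `model_params` for ASRCNN and the checkpoint stores weights under "model".
#
#     text_aligner = load_ASR_models("path/to/asr.pth", "path/to/asr_config.yml")
#     text_aligner.eval()
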
# JDC (F0) model
class ResBlock_JDC(nn.Module):
    def __init__(self, in_channels, out_channels, leaky_relu_slope=0.01):
        super().__init__()
        self.downsample = in_channels != out_channels
        self.pre_conv = nn.Sequential(nn.BatchNorm2d(num_features=in_channels), nn.LeakyReLU(leaky_relu_slope, inplace=True), nn.MaxPool2d(kernel_size=(1, 2)))
        self.conv = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(out_channels), nn.LeakyReLU(leaky_relu_slope, inplace=True), nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False))
        self.conv1by1 = None
        if self.downsample:
            self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)

    def forward(self, x):
        x = self.pre_conv(x)
        if self.downsample:
            x = self.conv(x) + self.conv1by1(x)
        else:
            x = self.conv(x) + x
        return x


class JDCNet(nn.Module):
    def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
        super().__init__()
        self.num_class = num_class
        self.conv_block = nn.Sequential(nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(num_features=64), nn.LeakyReLU(leaky_relu_slope, inplace=True), nn.Conv2d(64, 64, 3, padding=1, bias=False))
        self.res_block1 = ResBlock_JDC(in_channels=64, out_channels=128)
        self.res_block2 = ResBlock_JDC(in_channels=128, out_channels=192)
        self.res_block3 = ResBlock_JDC(in_channels=192, out_channels=256)
        self.pool_block = nn.Sequential(nn.BatchNorm2d(num_features=256), nn.LeakyReLU(leaky_relu_slope, inplace=True), nn.MaxPool2d(kernel_size=(1, 4)), nn.Dropout(p=0.2))
        # Maxpool layers for auxiliary network
        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
        self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
        self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))
        # Detector conv
        self.detector_conv = nn.Sequential(nn.Conv2d(640, 256, 1, bias=False), nn.BatchNorm2d(256), nn.LeakyReLU(leaky_relu_slope, inplace=True), nn.Dropout(p=0.2))
        # Classifier and detector LSTMs
        self.bilstm_classifier = nn.LSTM(input_size=512, hidden_size=256, batch_first=True, bidirectional=True)
        self.bilstm_detector = nn.LSTM(input_size=512, hidden_size=256, batch_first=True, bidirectional=True)
        # Output layers
        self.classifier = nn.Linear(in_features=512, out_features=self.num_class)
        self.detector = nn.Linear(in_features=512, out_features=2)

    def forward(self, x):
        seq_len = x.shape[-1]
        x = x.float().transpose(-1, -2)
        convblock_out = self.conv_block(x)
        resblock1_out = self.res_block1(convblock_out)
        resblock2_out = self.res_block2(resblock1_out)
        resblock3_out = self.res_block3(resblock2_out)
        poolblock_out = self.pool_block[0](resblock3_out)
        poolblock_out = self.pool_block[1](poolblock_out)
        GAN_feature = poolblock_out.transpose(-1, -2)
        poolblock_out = self.pool_block[2](poolblock_out)
        classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
        classifier_out, _ = self.bilstm_classifier(classifier_out)
        classifier_out = classifier_out.contiguous().view((-1, 512))
        classifier_out = self.classifier(classifier_out)
        classifier_out = classifier_out.view((-1, seq_len, self.num_class))
        return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out


def load_F0_models(path):
    """Load F0 (pitch) model."""
    F0_model = JDCNet(num_class=1, seq_len=192)
    params = torch.load(path, map_location='cpu')['net']
    F0_model.load_state_dict(params)
    return F0_model

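# Usage sketch (path is illustrative): the JDC checkpoint keeps its weights
# under the "net" key, and num_class=1 configures the network for F0
# regression rather than classification.
#
#     pitch_extractor = load_F0_models("path/to/jdc.t7")
#     pitch_extractor.eval()
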
# ============== Build Model ==============

def build_model(args, text_aligner, pitch_extractor, bert):
    """Build the full TTS model."""
    assert args.decoder.type in ['istftnet', 'hifigan'], 'Decoder type unknown'

    decoder = Decoder(
        dim_in=args.hidden_dim,
        style_dim=args.style_dim,
        dim_out=args.n_mels,
        resblock_kernel_sizes=args.decoder.resblock_kernel_sizes,
        upsample_rates=args.decoder.upsample_rates,
        upsample_initial_channel=args.decoder.upsample_initial_channel,
        resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
        upsample_kernel_sizes=args.decoder.upsample_kernel_sizes
    )

    text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
    predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
    style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)
    predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)

    if args.multispeaker:
        transformer = StyleTransformer1d(
            channels=args.style_dim * 2,
            context_embedding_features=bert.config.hidden_size,
            context_features=args.style_dim * 2,
            **args.diffusion.transformer
        )
    else:
        transformer = Transformer1d(
            channels=args.style_dim * 2,
            context_embedding_features=bert.config.hidden_size,
            **args.diffusion.transformer
        )

    diffusion = AudioDiffusionConditional(
        in_channels=1,
        embedding_max_length=bert.config.max_position_embeddings,
        embedding_features=bert.config.hidden_size,
        embedding_mask_proba=args.diffusion.embedding_mask_proba,
        channels=args.style_dim * 2,
        context_features=args.style_dim * 2,
    )

    diffusion.diffusion = KDiffusion(
        net=diffusion.unet,  # placeholder (None at this point); replaced just below
        sigma_distribution=LogNormalDistribution(mean=args.diffusion.dist.mean, std=args.diffusion.dist.std),
        sigma_data=args.diffusion.dist.sigma_data,
        dynamic_threshold=0.0
    )
    # Swap the placeholder net for the (style-)conditioned transformer.
    diffusion.diffusion.net = transformer
    diffusion.unet = transformer

    nets = Munch(
        bert=bert,
        bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
        predictor=predictor,
        decoder=decoder,
        text_encoder=text_encoder,
        predictor_encoder=predictor_encoder,
        style_encoder=style_encoder,
        diffusion=diffusion,
        text_aligner=text_aligner,
        pitch_extractor=pitch_extractor,
    )

    return nets

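A minimal wiring sketch for build_model, assuming a YAML config whose model_params section matches the attributes read above; the paths, the model_params key, and the to_munch helper are illustrative, not part of this package:

import yaml
from munch import Munch

def to_munch(d):
    # Recursively wrap dicts so nested keys resolve as attributes (args.decoder.type).
    return Munch({k: to_munch(v) if isinstance(v, dict) else v for k, v in d.items()})

with open("path/to/config.yml") as f:
    args = to_munch(yaml.safe_load(f)["model_params"])

plbert = load_plbert("path/to/plbert_dir")
text_aligner = load_ASR_models("path/to/asr.pth", "path/to/asr_config.yml")
pitch_extractor = load_F0_models("path/to/jdc.t7")
nets = build_model(args, text_aligner, pitch_extractor, plbert)
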
chiluka/models/diffusion/__init__.py
ADDED
@@ -0,0 +1,22 @@
"""Diffusion model components."""

from .sampler import (
    DiffusionSampler,
    ADPM2Sampler,
    KarrasSchedule,
    KDiffusion,
    LogNormalDistribution,
)
from .modules import Transformer1d, StyleTransformer1d
from .diffusion import AudioDiffusionConditional

__all__ = [
    "DiffusionSampler",
    "ADPM2Sampler",
    "KarrasSchedule",
    "KDiffusion",
    "LogNormalDistribution",
    "Transformer1d",
    "StyleTransformer1d",
    "AudioDiffusionConditional",
]
chiluka/models/diffusion/diffusion.py
ADDED
@@ -0,0 +1,72 @@
"""Audio diffusion model classes."""

import torch
from torch import Tensor, nn

from .utils import groupby
from .sampler import UniformDistribution


class LinearSchedule(nn.Module):
    def forward(self, num_steps: int, device) -> Tensor:
        sigmas = torch.linspace(1, 0, num_steps + 1, device=device)[:-1]
        return sigmas


class VSampler(nn.Module):
    # Placeholder sampler referenced by the default sampling kwargs.
    pass


class Model1d(nn.Module):
    def __init__(self, unet_type: str = "base", **kwargs):
        super().__init__()
        diffusion_kwargs, kwargs = groupby("diffusion_", kwargs)
        # Both are placeholders; build_model replaces them with the actual
        # transformer and KDiffusion objects after construction.
        self.unet = None
        self.diffusion = None

    def forward(self, x: Tensor, **kwargs) -> Tensor:
        return self.diffusion(x, **kwargs)

    def sample(self, *args, **kwargs) -> Tensor:
        return self.diffusion.sample(*args, **kwargs)


def get_default_model_kwargs():
    return dict(
        channels=128,
        patch_size=16,
        multipliers=[1, 2, 4, 4, 4, 4, 4],
        factors=[4, 4, 4, 2, 2, 2],
        num_blocks=[2, 2, 2, 2, 2, 2],
        attentions=[0, 0, 0, 1, 1, 1, 1],
        attention_heads=8,
        attention_features=64,
        attention_multiplier=2,
        attention_use_rel_pos=False,
        diffusion_type="v",
        diffusion_sigma_distribution=UniformDistribution(),
    )


def get_default_sampling_kwargs():
    return dict(sigma_schedule=LinearSchedule(), sampler=VSampler(), clamp=True)


class AudioDiffusionConditional(Model1d):
    def __init__(self, embedding_features: int, embedding_max_length: int, embedding_mask_proba: float = 0.1, **kwargs):
        self.embedding_mask_proba = embedding_mask_proba
        default_kwargs = dict(
            **get_default_model_kwargs(),
            unet_type="cfg",
            context_embedding_features=embedding_features,
            context_embedding_max_length=embedding_max_length,
        )
        super().__init__(**{**default_kwargs, **kwargs})

    def forward(self, *args, **kwargs):
        default_kwargs = dict(embedding_mask_proba=self.embedding_mask_proba)
        return super().forward(*args, **{**default_kwargs, **kwargs})

    def sample(self, *args, **kwargs):
        default_kwargs = dict(**get_default_sampling_kwargs(), embedding_scale=5.0)
        return super().sample(*args, **{**default_kwargs, **kwargs})
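Since Model1d leaves `unet` and `diffusion` as None, AudioDiffusionConditional only becomes functional after build_model wires in the transformer and KDiffusion. A hedged training-step sketch, reusing the `nets` Munch from build_model and assuming the single-speaker (Transformer1d) configuration; batch size, token count, and feature sizes are illustrative:

latents = torch.randn(4, 1, 256)      # (batch, in_channels, style_dim * 2)
text_emb = torch.randn(4, 512, 768)   # (batch, tokens, BERT hidden size)
loss = nets.diffusion(latents, embedding=text_emb)  # Karras-weighted denoising MSE
loss.backward()
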
chiluka/models/diffusion/modules.py
ADDED
@@ -0,0 +1,367 @@
"""Diffusion transformer modules."""

from math import log, pi
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange
from torch import Tensor, einsum

from .utils import exists, default, rand_bool


class AdaLayerNorm(nn.Module):
    def __init__(self, style_dim, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        self.fc = nn.Linear(style_dim, channels * 2)

    def forward(self, x, s):
        x = x.transpose(-1, -2)
        x = x.transpose(1, -1)
        h = self.fc(s)
        h = h.view(h.size(0), h.size(1), 1)
        gamma, beta = torch.chunk(h, chunks=2, dim=1)
        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), eps=self.eps)
        x = (1 + gamma) * x + beta
        return x.transpose(1, -1).transpose(-1, -2)


class LearnedPositionalEmbedding(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        assert (dim % 2) == 0
        half_dim = dim // 2
        self.weights = nn.Parameter(torch.randn(half_dim))

    def forward(self, x: Tensor) -> Tensor:
        x = rearrange(x, "b -> b 1")
        freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * pi
        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
        fouriered = torch.cat((x, fouriered), dim=-1)
        return fouriered


def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
    return nn.Sequential(
        LearnedPositionalEmbedding(dim),
        nn.Linear(in_features=dim + 1, out_features=out_features),
    )


class FixedEmbedding(nn.Module):
    def __init__(self, max_length: int, features: int):
        super().__init__()
        self.max_length = max_length
        self.embedding = nn.Embedding(max_length, features)

    def forward(self, x: Tensor) -> Tensor:
        batch_size, length, device = *x.shape[0:2], x.device
        assert length <= self.max_length, "Input sequence length must be <= max_length"
        position = torch.arange(length, device=device)
        fixed_embedding = self.embedding(position)
        fixed_embedding = repeat(fixed_embedding, "n d -> b n d", b=batch_size)
        return fixed_embedding


class RelativePositionBias(nn.Module):
    def __init__(self, num_buckets: int, max_distance: int, num_heads: int):
        super().__init__()
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        self.num_heads = num_heads
        self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)

    @staticmethod
    def _relative_position_bucket(relative_position: Tensor, num_buckets: int, max_distance: int):
        num_buckets //= 2
        ret = (relative_position >= 0).to(torch.long) * num_buckets
        n = torch.abs(relative_position)
        max_exact = num_buckets // 2
        is_small = n < max_exact
        val_if_large = max_exact + (torch.log(n.float() / max_exact) / log(max_distance / max_exact) * (num_buckets - max_exact)).long()
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
        ret += torch.where(is_small, n, val_if_large)
        return ret

    def forward(self, num_queries: int, num_keys: int) -> Tensor:
        i, j, device = num_queries, num_keys, self.relative_attention_bias.weight.device
        q_pos = torch.arange(j - i, j, dtype=torch.long, device=device)
        k_pos = torch.arange(j, dtype=torch.long, device=device)
        rel_pos = rearrange(k_pos, "j -> 1 j") - rearrange(q_pos, "i -> i 1")
        relative_position_bucket = self._relative_position_bucket(rel_pos, num_buckets=self.num_buckets, max_distance=self.max_distance)
        bias = self.relative_attention_bias(relative_position_bucket)
        bias = rearrange(bias, "m n h -> 1 h m n")
        return bias


def FeedForward(features: int, multiplier: int) -> nn.Module:
    mid_features = features * multiplier
    return nn.Sequential(
        nn.Linear(in_features=features, out_features=mid_features),
        nn.GELU(),
        nn.Linear(in_features=mid_features, out_features=features),
    )


class AttentionBase(nn.Module):
    def __init__(self, features: int, *, head_features: int, num_heads: int, use_rel_pos: bool,
                 out_features: Optional[int] = None, rel_pos_num_buckets: Optional[int] = None,
                 rel_pos_max_distance: Optional[int] = None):
        super().__init__()
        self.scale = head_features ** -0.5
        self.num_heads = num_heads
        self.use_rel_pos = use_rel_pos
        mid_features = head_features * num_heads
        if use_rel_pos:
            assert exists(rel_pos_num_buckets) and exists(rel_pos_max_distance)
            self.rel_pos = RelativePositionBias(num_buckets=rel_pos_num_buckets, max_distance=rel_pos_max_distance, num_heads=num_heads)
        if out_features is None:
            out_features = features
        self.to_out = nn.Linear(in_features=mid_features, out_features=out_features)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        h = self.num_heads
        q = rearrange(q, "b n (h d) -> b h n d", h=h)
        k = rearrange(k, "b n (h d) -> b h n d", h=h)
        v = rearrange(v, "b n (h d) -> b h n d", h=h)
        sim = einsum("b h n d, b h m d -> b h n m", q, k)
        sim = (sim + self.rel_pos(*sim.shape[-2:])) if self.use_rel_pos else sim
        sim = sim * self.scale
        attn = sim.softmax(dim=-1)
        out = einsum("b h n m, b h m d -> b h n d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)")
        return self.to_out(out)


class StyleAttention(nn.Module):
    def __init__(self, features: int, *, style_dim: int, head_features: int, num_heads: int,
                 context_features: Optional[int] = None, use_rel_pos: bool,
                 rel_pos_num_buckets: Optional[int] = None, rel_pos_max_distance: Optional[int] = None):
        super().__init__()
        self.context_features = context_features
        mid_features = head_features * num_heads
        context_features = default(context_features, features)
        self.norm = AdaLayerNorm(style_dim, features)
        self.norm_context = AdaLayerNorm(style_dim, context_features)
        self.to_q = nn.Linear(in_features=features, out_features=mid_features, bias=False)
        self.to_kv = nn.Linear(in_features=context_features, out_features=mid_features * 2, bias=False)
        self.attention = AttentionBase(features, num_heads=num_heads, head_features=head_features,
                                       use_rel_pos=use_rel_pos, rel_pos_num_buckets=rel_pos_num_buckets,
                                       rel_pos_max_distance=rel_pos_max_distance)

    def forward(self, x: Tensor, s: Tensor, *, context: Optional[Tensor] = None) -> Tensor:
        context = default(context, x)
        x, context = self.norm(x, s), self.norm_context(context, s)
        q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
        return self.attention(q, k, v)


class Attention(nn.Module):
    def __init__(self, features: int, *, head_features: int, num_heads: int, out_features: Optional[int] = None,
                 context_features: Optional[int] = None, use_rel_pos: bool,
                 rel_pos_num_buckets: Optional[int] = None, rel_pos_max_distance: Optional[int] = None):
        super().__init__()
        self.context_features = context_features
        mid_features = head_features * num_heads
        context_features = default(context_features, features)
        self.norm = nn.LayerNorm(features)
        self.norm_context = nn.LayerNorm(context_features)
        self.to_q = nn.Linear(in_features=features, out_features=mid_features, bias=False)
        self.to_kv = nn.Linear(in_features=context_features, out_features=mid_features * 2, bias=False)
        self.attention = AttentionBase(features, out_features=out_features, num_heads=num_heads, head_features=head_features,
                                       use_rel_pos=use_rel_pos, rel_pos_num_buckets=rel_pos_num_buckets,
                                       rel_pos_max_distance=rel_pos_max_distance)

    def forward(self, x: Tensor, *, context: Optional[Tensor] = None) -> Tensor:
        context = default(context, x)
        x, context = self.norm(x), self.norm_context(context)
        q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
        return self.attention(q, k, v)


class StyleTransformerBlock(nn.Module):
    def __init__(self, features: int, num_heads: int, head_features: int, style_dim: int, multiplier: int,
                 use_rel_pos: bool, rel_pos_num_buckets: Optional[int] = None,
                 rel_pos_max_distance: Optional[int] = None, context_features: Optional[int] = None):
        super().__init__()
        self.use_cross_attention = exists(context_features) and context_features > 0
        self.attention = StyleAttention(features=features, style_dim=style_dim, num_heads=num_heads, head_features=head_features,
                                        use_rel_pos=use_rel_pos, rel_pos_num_buckets=rel_pos_num_buckets,
                                        rel_pos_max_distance=rel_pos_max_distance)
        if self.use_cross_attention:
            self.cross_attention = StyleAttention(features=features, style_dim=style_dim, num_heads=num_heads, head_features=head_features,
                                                  context_features=context_features, use_rel_pos=use_rel_pos,
                                                  rel_pos_num_buckets=rel_pos_num_buckets, rel_pos_max_distance=rel_pos_max_distance)
        self.feed_forward = FeedForward(features=features, multiplier=multiplier)

    def forward(self, x: Tensor, s: Tensor, *, context: Optional[Tensor] = None) -> Tensor:
        x = self.attention(x, s) + x
        if self.use_cross_attention:
            x = self.cross_attention(x, s, context=context) + x
        x = self.feed_forward(x) + x
        return x


class TransformerBlock(nn.Module):
    def __init__(self, features: int, num_heads: int, head_features: int, multiplier: int, use_rel_pos: bool,
                 rel_pos_num_buckets: Optional[int] = None, rel_pos_max_distance: Optional[int] = None,
                 context_features: Optional[int] = None):
        super().__init__()
        self.use_cross_attention = exists(context_features) and context_features > 0
        self.attention = Attention(features=features, num_heads=num_heads, head_features=head_features,
                                   use_rel_pos=use_rel_pos, rel_pos_num_buckets=rel_pos_num_buckets,
                                   rel_pos_max_distance=rel_pos_max_distance)
        if self.use_cross_attention:
            self.cross_attention = Attention(features=features, num_heads=num_heads, head_features=head_features,
                                             context_features=context_features, use_rel_pos=use_rel_pos,
                                             rel_pos_num_buckets=rel_pos_num_buckets, rel_pos_max_distance=rel_pos_max_distance)
        self.feed_forward = FeedForward(features=features, multiplier=multiplier)

    def forward(self, x: Tensor, *, context: Optional[Tensor] = None) -> Tensor:
        x = self.attention(x) + x
        if self.use_cross_attention:
            x = self.cross_attention(x, context=context) + x
        x = self.feed_forward(x) + x
        return x


class StyleTransformer1d(nn.Module):
    def __init__(self, num_layers: int, channels: int, num_heads: int, head_features: int, multiplier: int,
                 use_context_time: bool = True, use_rel_pos: bool = False, context_features_multiplier: int = 1,
                 rel_pos_num_buckets: Optional[int] = None, rel_pos_max_distance: Optional[int] = None,
                 context_features: Optional[int] = None, context_embedding_features: Optional[int] = None,
                 embedding_max_length: int = 512):
        super().__init__()
        self.blocks = nn.ModuleList([
            StyleTransformerBlock(features=channels + context_embedding_features, head_features=head_features, num_heads=num_heads,
                                  multiplier=multiplier, style_dim=context_features, use_rel_pos=use_rel_pos,
                                  rel_pos_num_buckets=rel_pos_num_buckets, rel_pos_max_distance=rel_pos_max_distance)
            for _ in range(num_layers)
        ])
        self.to_out = nn.Sequential(
            Rearrange("b t c -> b c t"),
            nn.Conv1d(in_channels=channels + context_embedding_features, out_channels=channels, kernel_size=1),
        )
        use_context_features = exists(context_features)
        self.use_context_features = use_context_features
        self.use_context_time = use_context_time
        if use_context_time or use_context_features:
            context_mapping_features = channels + context_embedding_features
            self.to_mapping = nn.Sequential(nn.Linear(context_mapping_features, context_mapping_features), nn.GELU(),
                                            nn.Linear(context_mapping_features, context_mapping_features), nn.GELU())
        if use_context_time:
            self.to_time = nn.Sequential(TimePositionalEmbedding(dim=channels, out_features=context_mapping_features), nn.GELU())
        if use_context_features:
            self.to_features = nn.Sequential(nn.Linear(in_features=context_features, out_features=context_mapping_features), nn.GELU())
        self.fixed_embedding = FixedEmbedding(max_length=embedding_max_length, features=context_embedding_features)

    def get_mapping(self, time: Optional[Tensor] = None, features: Optional[Tensor] = None) -> Optional[Tensor]:
        items, mapping = [], None
        if self.use_context_time:
            items += [self.to_time(time)]
        if self.use_context_features:
            items += [self.to_features(features)]
        if self.use_context_time or self.use_context_features:
            mapping = reduce(torch.stack(items), "n b m -> b m", "sum")
            mapping = self.to_mapping(mapping)
        return mapping

    def run(self, x, time, embedding, features):
        mapping = self.get_mapping(time, features)
        x = torch.cat([x.expand(-1, embedding.size(1), -1), embedding], axis=-1)
        mapping = mapping.unsqueeze(1).expand(-1, embedding.size(1), -1)
        for block in self.blocks:
            x = x + mapping
            x = block(x, features)
        x = x.mean(axis=1).unsqueeze(1)
        x = self.to_out(x)
        x = x.transpose(-1, -2)
        return x

    def forward(self, x: Tensor, time: Tensor, embedding_mask_proba: float = 0.0, embedding: Optional[Tensor] = None,
                features: Optional[Tensor] = None, embedding_scale: float = 1.0) -> Tensor:
        b, device = embedding.shape[0], embedding.device
        fixed_embedding = self.fixed_embedding(embedding)
        if embedding_mask_proba > 0.0:
            batch_mask = rand_bool(shape=(b, 1, 1), proba=embedding_mask_proba, device=device)
            embedding = torch.where(batch_mask, fixed_embedding, embedding)
        if embedding_scale != 1.0:
            out = self.run(x, time, embedding=embedding, features=features)
            out_masked = self.run(x, time, embedding=fixed_embedding, features=features)
            return out_masked + (out - out_masked) * embedding_scale
        else:
            return self.run(x, time, embedding=embedding, features=features)


class Transformer1d(nn.Module):
    def __init__(self, num_layers: int, channels: int, num_heads: int, head_features: int, multiplier: int,
                 use_context_time: bool = True, use_rel_pos: bool = False, context_features_multiplier: int = 1,
                 rel_pos_num_buckets: Optional[int] = None, rel_pos_max_distance: Optional[int] = None,
                 context_features: Optional[int] = None, context_embedding_features: Optional[int] = None,
                 embedding_max_length: int = 512):
        super().__init__()
        self.blocks = nn.ModuleList([
            TransformerBlock(features=channels + context_embedding_features, head_features=head_features, num_heads=num_heads,
                             multiplier=multiplier, use_rel_pos=use_rel_pos, rel_pos_num_buckets=rel_pos_num_buckets,
                             rel_pos_max_distance=rel_pos_max_distance)
            for _ in range(num_layers)
        ])
        self.to_out = nn.Sequential(
            Rearrange("b t c -> b c t"),
            nn.Conv1d(in_channels=channels + context_embedding_features, out_channels=channels, kernel_size=1),
        )
        use_context_features = exists(context_features)
        self.use_context_features = use_context_features
        self.use_context_time = use_context_time
        if use_context_time or use_context_features:
            context_mapping_features = channels + context_embedding_features
            self.to_mapping = nn.Sequential(nn.Linear(context_mapping_features, context_mapping_features), nn.GELU(),
                                            nn.Linear(context_mapping_features, context_mapping_features), nn.GELU())
        if use_context_time:
            self.to_time = nn.Sequential(TimePositionalEmbedding(dim=channels, out_features=context_mapping_features), nn.GELU())
        if use_context_features:
            self.to_features = nn.Sequential(nn.Linear(in_features=context_features, out_features=context_mapping_features), nn.GELU())
        self.fixed_embedding = FixedEmbedding(max_length=embedding_max_length, features=context_embedding_features)

    def get_mapping(self, time: Optional[Tensor] = None, features: Optional[Tensor] = None) -> Optional[Tensor]:
        items, mapping = [], None
        if self.use_context_time:
            items += [self.to_time(time)]
        if self.use_context_features:
            items += [self.to_features(features)]
        if self.use_context_time or self.use_context_features:
            mapping = reduce(torch.stack(items), "n b m -> b m", "sum")
            mapping = self.to_mapping(mapping)
        return mapping

    def run(self, x, time, embedding, features):
        mapping = self.get_mapping(time, features)
        x = torch.cat([x.expand(-1, embedding.size(1), -1), embedding], axis=-1)
        mapping = mapping.unsqueeze(1).expand(-1, embedding.size(1), -1)
        for block in self.blocks:
            x = x + mapping
            x = block(x)
        x = x.mean(axis=1).unsqueeze(1)
        x = self.to_out(x)
        x = x.transpose(-1, -2)
        return x

    def forward(self, x: Tensor, time: Tensor, embedding_mask_proba: float = 0.0, embedding: Optional[Tensor] = None,
                features: Optional[Tensor] = None, embedding_scale: float = 1.0) -> Tensor:
        b, device = embedding.shape[0], embedding.device
        fixed_embedding = self.fixed_embedding(embedding)
        if embedding_mask_proba > 0.0:
            batch_mask = rand_bool(shape=(b, 1, 1), proba=embedding_mask_proba, device=device)
            embedding = torch.where(batch_mask, fixed_embedding, embedding)
        if embedding_scale != 1.0:
            out = self.run(x, time, embedding=embedding, features=features)
            out_masked = self.run(x, time, embedding=fixed_embedding, features=features)
            return out_masked + (out - out_masked) * embedding_scale
        else:
            return self.run(x, time, embedding=embedding, features=features)
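Both forward methods above implement classifier-free guidance at inference time: when embedding_scale differs from 1.0, the network runs once with the real text embedding and once with the learned fixed_embedding, then blends the two as

    out = out_masked + (out - out_masked) * embedding_scale

so a scale of 1.0 recovers the purely conditional prediction, while the default sampling scale of 5.0 (set in diffusion.py) extrapolates away from the unconditional one. During training, embedding_mask_proba controls how often the fixed embedding is substituted, which is what makes the unconditional branch meaningful.
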
chiluka/models/diffusion/sampler.py
ADDED
@@ -0,0 +1,176 @@
"""Diffusion sampling classes."""

from math import atan, cos, pi, sin, sqrt
from typing import Any, Callable, List, Optional, Tuple, Type

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, reduce
from torch import Tensor

from .utils import exists, default


class Distribution:
    def __call__(self, num_samples: int, device: torch.device):
        raise NotImplementedError()


class LogNormalDistribution(Distribution):
    def __init__(self, mean: float, std: float):
        self.mean = mean
        self.std = std

    def __call__(self, num_samples: int, device: torch.device = torch.device("cpu")) -> Tensor:
        normal = self.mean + self.std * torch.randn((num_samples,), device=device)
        return normal.exp()


class UniformDistribution(Distribution):
    def __call__(self, num_samples: int, device: torch.device = torch.device("cpu")):
        return torch.rand(num_samples, device=device)


def to_batch(batch_size: int, device: torch.device, x: Optional[float] = None, xs: Optional[Tensor] = None) -> Tensor:
    assert exists(x) ^ exists(xs), "Either x or xs must be provided"
    if exists(x):
        xs = torch.full(size=(batch_size,), fill_value=x).to(device)
    assert exists(xs)
    return xs


class Diffusion(nn.Module):
    alias: str = ""

    def denoise_fn(self, x_noisy: Tensor, sigmas: Optional[Tensor] = None, sigma: Optional[float] = None, **kwargs) -> Tensor:
        raise NotImplementedError("Diffusion class missing denoise_fn")

    def forward(self, x: Tensor, noise: Tensor = None, **kwargs) -> Tensor:
        raise NotImplementedError("Diffusion class missing forward function")


class KDiffusion(Diffusion):
    """Elucidated Diffusion (Karras et al. 2022)"""

    alias = "k"

    def __init__(self, net: nn.Module, *, sigma_distribution: Distribution, sigma_data: float, dynamic_threshold: float = 0.0):
        super().__init__()
        self.net = net
        self.sigma_data = sigma_data
        self.sigma_distribution = sigma_distribution
        self.dynamic_threshold = dynamic_threshold

    def get_scale_weights(self, sigmas: Tensor) -> Tuple[Tensor, ...]:
        sigma_data = self.sigma_data
        c_noise = torch.log(sigmas) * 0.25
        sigmas = rearrange(sigmas, "b -> b 1 1")
        c_skip = (sigma_data ** 2) / (sigmas ** 2 + sigma_data ** 2)
        c_out = sigmas * sigma_data * (sigma_data ** 2 + sigmas ** 2) ** -0.5
        c_in = (sigmas ** 2 + sigma_data ** 2) ** -0.5
        return c_skip, c_out, c_in, c_noise

    def denoise_fn(self, x_noisy: Tensor, sigmas: Optional[Tensor] = None, sigma: Optional[float] = None, **kwargs) -> Tensor:
        batch_size, device = x_noisy.shape[0], x_noisy.device
        sigmas = to_batch(x=sigma, xs=sigmas, batch_size=batch_size, device=device)
        c_skip, c_out, c_in, c_noise = self.get_scale_weights(sigmas)
        x_pred = self.net(c_in * x_noisy, c_noise, **kwargs)
        x_denoised = c_skip * x_noisy + c_out * x_pred
        return x_denoised

    def loss_weight(self, sigmas: Tensor) -> Tensor:
        return (sigmas ** 2 + self.sigma_data ** 2) * (sigmas * self.sigma_data) ** -2

    def forward(self, x: Tensor, noise: Tensor = None, **kwargs) -> Tensor:
        batch_size, device = x.shape[0], x.device
        sigmas = self.sigma_distribution(num_samples=batch_size, device=device)
        sigmas_padded = rearrange(sigmas, "b -> b 1 1")
        noise = default(noise, lambda: torch.randn_like(x))
        x_noisy = x + sigmas_padded * noise
        x_denoised = self.denoise_fn(x_noisy, sigmas=sigmas, **kwargs)
        losses = F.mse_loss(x_denoised, x, reduction="none")
        losses = reduce(losses, "b ... -> b", "mean")
        losses = losses * self.loss_weight(sigmas)
        return losses.mean()


class Schedule(nn.Module):
    def forward(self, num_steps: int, device: torch.device) -> Tensor:
        raise NotImplementedError()


class KarrasSchedule(Schedule):
    def __init__(self, sigma_min: float, sigma_max: float, rho: float = 7.0):
        super().__init__()
        self.sigma_min = sigma_min
        self.sigma_max = sigma_max
        self.rho = rho

    def forward(self, num_steps: int, device: Any) -> Tensor:
        rho_inv = 1.0 / self.rho
        steps = torch.arange(num_steps, device=device, dtype=torch.float32)
        sigmas = (
            self.sigma_max ** rho_inv
            + (steps / (num_steps - 1))
            * (self.sigma_min ** rho_inv - self.sigma_max ** rho_inv)
        ) ** self.rho
        sigmas = F.pad(sigmas, pad=(0, 1), value=0.0)
        return sigmas


class Sampler(nn.Module):
    diffusion_types: List[Type[Diffusion]] = []

    def forward(self, noise: Tensor, fn: Callable, sigmas: Tensor, num_steps: int) -> Tensor:
        raise NotImplementedError()


class ADPM2Sampler(Sampler):
    diffusion_types = [KDiffusion]

    def __init__(self, rho: float = 1.0):
        super().__init__()
        self.rho = rho

    def get_sigmas(self, sigma: float, sigma_next: float) -> Tuple[float, float, float]:
        r = self.rho
        sigma_up = sqrt(sigma_next ** 2 * (sigma ** 2 - sigma_next ** 2) / sigma ** 2)
        sigma_down = sqrt(sigma_next ** 2 - sigma_up ** 2)
        sigma_mid = ((sigma ** (1 / r) + sigma_down ** (1 / r)) / 2) ** r
        return sigma_up, sigma_down, sigma_mid

    def step(self, x: Tensor, fn: Callable, sigma: float, sigma_next: float) -> Tensor:
        sigma_up, sigma_down, sigma_mid = self.get_sigmas(sigma, sigma_next)
        d = (x - fn(x, sigma=sigma)) / sigma
        x_mid = x + d * (sigma_mid - sigma)
        d_mid = (x_mid - fn(x_mid, sigma=sigma_mid)) / sigma_mid
        x = x + d_mid * (sigma_down - sigma)
        x_next = x + torch.randn_like(x) * sigma_up
        return x_next

    def forward(self, noise: Tensor, fn: Callable, sigmas: Tensor, num_steps: int) -> Tensor:
        x = sigmas[0] * noise
        for i in range(num_steps - 1):
            x = self.step(x, fn=fn, sigma=sigmas[i], sigma_next=sigmas[i + 1])
        return x


class DiffusionSampler(nn.Module):
    def __init__(self, diffusion: Diffusion, *, sampler: Sampler, sigma_schedule: Schedule, num_steps: Optional[int] = None, clamp: bool = True):
        super().__init__()
        self.denoise_fn = diffusion.denoise_fn
        self.sampler = sampler
        self.sigma_schedule = sigma_schedule
        self.num_steps = num_steps
        self.clamp = clamp

    def forward(self, noise: Tensor, num_steps: Optional[int] = None, **kwargs) -> Tensor:
        device = noise.device
        num_steps = default(num_steps, self.num_steps)
        assert exists(num_steps), "Parameter `num_steps` must be provided"
        sigmas = self.sigma_schedule(num_steps, device)
        fn = lambda *a, **ka: self.denoise_fn(*a, **{**ka, **kwargs})
        x = self.sampler(noise, fn=fn, sigmas=sigmas, num_steps=num_steps)
        x = x.clamp(-1.0, 1.0) if self.clamp else x
        return x
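A hedged end-to-end sampling sketch wiring this file's pieces together, reusing `nets` and `text_emb` from the earlier sketches; the step count, sigma range, rho, and tensor sizes are illustrative choices, not values mandated by the code:

sampler = DiffusionSampler(
    nets.diffusion.diffusion,                # the KDiffusion built in build_model
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0),
    clamp=False,
)
noise = torch.randn(1, 1, 256)               # same shape as the style latent
style_latent = sampler(noise, num_steps=5, embedding=text_emb)
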
chiluka/models/diffusion/utils.py
ADDED
@@ -0,0 +1,40 @@
"""Diffusion utility functions."""

from functools import reduce
from inspect import isfunction
from math import ceil, floor, log2
from typing import Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union

import torch
import torch.nn.functional as F
from typing_extensions import TypeGuard

T = TypeVar("T")


def exists(val: Optional[T]) -> TypeGuard[T]:
    return val is not None


def default(val: Optional[T], d: Union[Callable[..., T], T]) -> T:
    if exists(val):
        return val
    return d() if isfunction(d) else d


def rand_bool(shape, proba, device=None):
    if proba == 1:
        return torch.ones(shape, device=device, dtype=torch.bool)
    elif proba == 0:
        return torch.zeros(shape, device=device, dtype=torch.bool)
    else:
        return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)


def groupby(prefix: str, d: Dict, keep_prefix: bool = False) -> Tuple[Dict, Dict]:
    kwargs_with_prefix = {k: v for k, v in d.items() if k.startswith(prefix)}
    kwargs = {k: v for k, v in d.items() if not k.startswith(prefix)}
    if keep_prefix:
        return kwargs_with_prefix, kwargs
    kwargs_no_prefix = {k[len(prefix):]: v for k, v in kwargs_with_prefix.items()}
    return kwargs_no_prefix, kwargs
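groupby is the kwargs splitter used by Model1d in diffusion.py; a small worked example (values illustrative):

opts = {"diffusion_sigma_data": 0.2, "channels": 128}
diffusion_kwargs, rest = groupby("diffusion_", opts)
# diffusion_kwargs == {"sigma_data": 0.2}; rest == {"channels": 128}
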
chiluka/models/hifigan.py
ADDED
@@ -0,0 +1,266 @@
+"""HiFi-GAN decoder for waveform synthesis."""
+
+import math
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+LRELU_SLOPE = 0.1
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+class AdaIN1d(nn.Module):
+    def __init__(self, style_dim, num_features):
+        super().__init__()
+        self.norm = nn.InstanceNorm1d(num_features, affine=False)
+        self.fc = nn.Linear(style_dim, num_features * 2)
+
+    def forward(self, x, s):
+        h = self.fc(s)
+        h = h.view(h.size(0), h.size(1), 1)
+        gamma, beta = torch.chunk(h, chunks=2, dim=1)
+        return (1 + gamma) * self.norm(x) + beta
+
+
+class AdaINResBlock1(nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
+        super().__init__()
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+        self.adain1 = nn.ModuleList([AdaIN1d(style_dim, channels) for _ in range(3)])
+        self.adain2 = nn.ModuleList([AdaIN1d(style_dim, channels) for _ in range(3)])
+        self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for _ in range(len(self.convs1))])
+        self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for _ in range(len(self.convs2))])
+
+    def forward(self, x, s):
+        for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
+            xt = n1(x, s)
+            xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)
+            xt = c1(xt)
+            xt = n2(xt, s)
+            xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+
+class SineGen(nn.Module):
+    def __init__(self, samp_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0, flag_for_pulse=False):
+        super().__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.upsample_scale = upsample_scale
+
+    def _f02uv(self, f0):
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    def _f02sine(self, f0_values):
+        rad_values = (f0_values / self.sampling_rate) % 1
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+        rad_values = F.interpolate(rad_values.transpose(1, 2), scale_factor=1/self.upsample_scale, mode="linear").transpose(1, 2)
+        phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+        phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+        sines = torch.sin(phase)
+        return sines
+
+    def forward(self, f0):
+        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+        sine_waves = self._f02sine(fn) * self.sine_amp
+        uv = self._f02uv(f0)
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(nn.Module):
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0):
+        super().__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num, sine_amp, add_noise_std, voiced_threshod)
+        self.l_linear = nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = nn.Tanh()
+
+    def forward(self, x):
+        with torch.no_grad():
+            sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+
+
+class Generator(nn.Module):
+    def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
+        super().__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        resblock = AdaINResBlock1
+        self.m_source = SourceModuleHnNSF(sampling_rate=24000, upsample_scale=np.prod(upsample_rates), harmonic_num=8, voiced_threshod=10)
+        self.f0_upsamp = nn.Upsample(scale_factor=np.prod(upsample_rates))
+        self.noise_convs = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        self.noise_res = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(u // 2 + u % 2), output_padding=u % 2)))
+            if i + 1 < len(upsample_rates):
+                stride_f0 = np.prod(upsample_rates[i + 1:])
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0 + 1) // 2))
+                self.noise_res.append(resblock(c_cur, 7, [1, 3, 5], style_dim))
+            else:
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+                self.noise_res.append(resblock(c_cur, 11, [1, 3, 5], style_dim))
+        self.resblocks = nn.ModuleList()
+        self.alphas = nn.ParameterList()
+        self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(resblock(ch, k, d, style_dim))
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x, s, f0):
+        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)
+        har_source, noi_source, uv = self.m_source(f0)
+        har_source = har_source.transpose(1, 2)
+        for i in range(self.num_upsamples):
+            x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
+            x_source = self.noise_convs[i](har_source)
+            x_source = self.noise_res[i](x_source, s)
+            x = self.ups[i](x)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x, s)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x, s)
+            x = xs / self.num_kernels
+        x = x + (1 / self.alphas[i + 1]) * (torch.sin(self.alphas[i + 1] * x) ** 2)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        else:
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+
+
+class AdainResBlk1d(nn.Module):
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2), upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        self.dropout = nn.Dropout(dropout_p)
+        if upsample == 'none':
+            self.pool = nn.Identity()
+        else:
+            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        x = self.upsample(x)
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        return x
+
+    def _residual(self, x, s):
+        x = self.norm1(x, s)
+        x = self.actv(x)
+        x = self.pool(x)
+        x = self.conv1(self.dropout(x))
+        x = self.norm2(x, s)
+        x = self.actv(x)
+        x = self.conv2(self.dropout(x))
+        return x
+
+    def forward(self, x, s):
+        out = self._residual(x, s)
+        out = (out + self._shortcut(x)) / math.sqrt(2)
+        return out
+
+
+class Decoder(nn.Module):
+    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80, resblock_kernel_sizes=[3, 7, 11],
+                 upsample_rates=[10, 5, 3, 2], upsample_initial_channel=512, resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                 upsample_kernel_sizes=[20, 10, 6, 4]):
+        super().__init__()
+        self.decode = nn.ModuleList()
+        self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
+        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+        self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+        self.asr_res = nn.Sequential(weight_norm(nn.Conv1d(512, 64, kernel_size=1)))
+        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
+
+    def forward(self, asr, F0_curve, N, s):
+        F0 = self.F0_conv(F0_curve.unsqueeze(1))
+        N = self.N_conv(N.unsqueeze(1))
+        x = torch.cat([asr, F0, N], axis=1)
+        x = self.encode(x, s)
+        asr_res = self.asr_res(asr)
+        res = True
+        for block in self.decode:
+            if res:
+                x = torch.cat([x, asr_res, F0, N], axis=1)
+            x = block(x, s)
+            if block.upsample_type != "none":
+                res = False
+        x = self.generator(x, s, F0_curve)
+        return x
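
The Decoder above fuses aligned text features, an F0 curve, and an energy curve under a style vector, then hands the result to the source-filter Generator. A hedged smoke-test sketch follows; the tensor shapes are inferred from the layer definitions above, not from official documentation:

import torch
from chiluka.models.hifigan import Decoder

dec = Decoder()                    # defaults: dim_in=512, style_dim=64, 24 kHz
B, T = 1, 40                       # batch size, aligned feature frames
asr = torch.randn(B, 512, T)       # aligned text features
f0 = torch.randn(B, 2 * T).abs()   # F0 curve; F0_conv halves it (stride 2)
n = torch.randn(B, 2 * T).abs()    # energy curve, same stride-2 treatment
s = torch.randn(B, 64)             # acoustic style vector
wav = dec(asr, f0, n, s)           # (B, 1, 600 * T) samples at 24 kHz
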
chiluka/pretrained/ASR/__init__.py
ADDED
@@ -0,0 +1 @@
+
chiluka/pretrained/ASR/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (150 Bytes)
chiluka/pretrained/ASR/__pycache__/layers.cpython-310.pyc
ADDED
Binary file (11 kB)
chiluka/pretrained/ASR/__pycache__/models.cpython-310.pyc
ADDED
Binary file (6.12 kB)
chiluka/pretrained/ASR/config.yml
ADDED
@@ -0,0 +1,29 @@
+log_dir: "logs/20201006"
+save_freq: 5
+device: "cuda"
+epochs: 180
+batch_size: 64
+pretrained_model: ""
+train_data: "ASRDataset/train_list.txt"
+val_data: "ASRDataset/val_list.txt"
+
+dataset_params:
+  data_augmentation: false
+
+preprocess_parasm:
+  sr: 24000
+  spect_params:
+    n_fft: 2048
+    win_length: 1200
+    hop_length: 300
+  mel_params:
+    n_mels: 80
+
+model_params:
+  input_dim: 80
+  hidden_dim: 256
+  n_token: 178
+  token_embedding_dim: 512
+
+optimizer_params:
+  lr: 0.0005
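
The model_params block matches the constructor of ASRCNN defined in models.py below; a plausible loading sketch (the package may wire this up differently internally):

import yaml
from chiluka.pretrained.ASR.models import ASRCNN

config = yaml.safe_load(open("chiluka/pretrained/ASR/config.yml"))
text_aligner = ASRCNN(**config["model_params"])  # input_dim=80, n_token=178, ...
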
chiluka/pretrained/ASR/epoch_00080.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fedd55a1234b0c56e1e8b509c74edf3a5e2f27106a66038a4a946047a775bd6c
+size 94552811
chiluka/pretrained/ASR/layers.py
ADDED
@@ -0,0 +1,354 @@
+import math
+import torch
+from torch import nn
+from typing import Optional, Any
+from torch import Tensor
+import torch.nn.functional as F
+import torchaudio
+import torchaudio.functional as audio_F
+
+import random
+random.seed(0)
+
+
+def _get_activation_fn(activ):
+    if activ == 'relu':
+        return nn.ReLU()
+    elif activ == 'lrelu':
+        return nn.LeakyReLU(0.2)
+    elif activ == 'swish':
+        return lambda x: x*torch.sigmoid(x)
+    else:
+        raise RuntimeError('Unexpected activ type %s, expected [relu, lrelu, swish]' % activ)
+
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super(LinearNorm, self).__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+        torch.nn.init.xavier_uniform_(
+            self.linear_layer.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+
+    def forward(self, x):
+        return self.linear_layer(x)
+
+
+class ConvNorm(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
+                 padding=None, dilation=1, bias=True, w_init_gain='linear', param=None):
+        super(ConvNorm, self).__init__()
+        if padding is None:
+            assert(kernel_size % 2 == 1)
+            padding = int(dilation * (kernel_size - 1) / 2)
+
+        self.conv = torch.nn.Conv1d(in_channels, out_channels,
+                                    kernel_size=kernel_size, stride=stride,
+                                    padding=padding, dilation=dilation,
+                                    bias=bias)
+
+        torch.nn.init.xavier_uniform_(
+            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))
+
+    def forward(self, signal):
+        conv_signal = self.conv(signal)
+        return conv_signal
+
+class CausualConv(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=1, dilation=1, bias=True, w_init_gain='linear', param=None):
+        super(CausualConv, self).__init__()
+        if padding is None:
+            assert(kernel_size % 2 == 1)
+            self.padding = int(dilation * (kernel_size - 1) / 2) * 2  # fixed: was assigned to a local, leaving self.padding unset
+        else:
+            self.padding = padding * 2
+        self.conv = nn.Conv1d(in_channels, out_channels,
+                              kernel_size=kernel_size, stride=stride,
+                              padding=self.padding,
+                              dilation=dilation,
+                              bias=bias)
+
+        torch.nn.init.xavier_uniform_(
+            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = x[:, :, :-self.padding]
+        return x
+
+class CausualBlock(nn.Module):
+    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='lrelu'):
+        super(CausualBlock, self).__init__()
+        self.blocks = nn.ModuleList([
+            self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
+            for i in range(n_conv)])
+
+    def forward(self, x):
+        for block in self.blocks:
+            res = x
+            x = block(x)
+            x += res
+        return x
+
+    def _get_conv(self, hidden_dim, dilation, activ='lrelu', dropout_p=0.2):
+        layers = [
+            CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
+            _get_activation_fn(activ),
+            nn.BatchNorm1d(hidden_dim),
+            nn.Dropout(p=dropout_p),
+            CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
+            _get_activation_fn(activ),
+            nn.Dropout(p=dropout_p)
+        ]
+        return nn.Sequential(*layers)
+
+class ConvBlock(nn.Module):
+    def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='relu'):
+        super().__init__()
+        self._n_groups = 8
+        self.blocks = nn.ModuleList([
+            self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
+            for i in range(n_conv)])
+
+    def forward(self, x):
+        for block in self.blocks:
+            res = x
+            x = block(x)
+            x += res
+        return x
+
+    def _get_conv(self, hidden_dim, dilation, activ='relu', dropout_p=0.2):
+        layers = [
+            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
+            _get_activation_fn(activ),
+            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
+            nn.Dropout(p=dropout_p),
+            ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
+            _get_activation_fn(activ),
+            nn.Dropout(p=dropout_p)
+        ]
+        return nn.Sequential(*layers)
+
+class LocationLayer(nn.Module):
+    def __init__(self, attention_n_filters, attention_kernel_size,
+                 attention_dim):
+        super(LocationLayer, self).__init__()
+        padding = int((attention_kernel_size - 1) / 2)
+        self.location_conv = ConvNorm(2, attention_n_filters,
+                                      kernel_size=attention_kernel_size,
+                                      padding=padding, bias=False, stride=1,
+                                      dilation=1)
+        self.location_dense = LinearNorm(attention_n_filters, attention_dim,
+                                         bias=False, w_init_gain='tanh')
+
+    def forward(self, attention_weights_cat):
+        processed_attention = self.location_conv(attention_weights_cat)
+        processed_attention = processed_attention.transpose(1, 2)
+        processed_attention = self.location_dense(processed_attention)
+        return processed_attention
+
+
+class Attention(nn.Module):
+    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
+                 attention_location_n_filters, attention_location_kernel_size):
+        super(Attention, self).__init__()
+        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
+                                      bias=False, w_init_gain='tanh')
+        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
+                                       w_init_gain='tanh')
+        self.v = LinearNorm(attention_dim, 1, bias=False)
+        self.location_layer = LocationLayer(attention_location_n_filters,
+                                            attention_location_kernel_size,
+                                            attention_dim)
+        self.score_mask_value = -float("inf")
+
+    def get_alignment_energies(self, query, processed_memory,
+                               attention_weights_cat):
+        """
+        PARAMS
+        ------
+        query: decoder output (batch, n_mel_channels * n_frames_per_step)
+        processed_memory: processed encoder outputs (B, T_in, attention_dim)
+        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
+        RETURNS
+        -------
+        alignment (batch, max_time)
+        """
+
+        processed_query = self.query_layer(query.unsqueeze(1))
+        processed_attention_weights = self.location_layer(attention_weights_cat)
+        energies = self.v(torch.tanh(
+            processed_query + processed_attention_weights + processed_memory))
+
+        energies = energies.squeeze(-1)
+        return energies
+
+    def forward(self, attention_hidden_state, memory, processed_memory,
+                attention_weights_cat, mask):
+        """
+        PARAMS
+        ------
+        attention_hidden_state: attention rnn last output
+        memory: encoder outputs
+        processed_memory: processed encoder outputs
+        attention_weights_cat: previous and cumulative attention weights
+        mask: binary mask for padded data
+        """
+        alignment = self.get_alignment_energies(
+            attention_hidden_state, processed_memory, attention_weights_cat)
+
+        if mask is not None:
+            alignment.data.masked_fill_(mask, self.score_mask_value)
+
+        attention_weights = F.softmax(alignment, dim=1)
+        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
+        attention_context = attention_context.squeeze(1)
+
+        return attention_context, attention_weights
+
+
+class ForwardAttentionV2(nn.Module):
+    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
+                 attention_location_n_filters, attention_location_kernel_size):
+        super(ForwardAttentionV2, self).__init__()
+        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
+                                      bias=False, w_init_gain='tanh')
+        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
+                                       w_init_gain='tanh')
+        self.v = LinearNorm(attention_dim, 1, bias=False)
+        self.location_layer = LocationLayer(attention_location_n_filters,
+                                            attention_location_kernel_size,
+                                            attention_dim)
+        self.score_mask_value = -float(1e20)
+
+    def get_alignment_energies(self, query, processed_memory,
+                               attention_weights_cat):
+        """
+        PARAMS
+        ------
+        query: decoder output (batch, n_mel_channels * n_frames_per_step)
+        processed_memory: processed encoder outputs (B, T_in, attention_dim)
+        attention_weights_cat: prev. and cumulative att weights (B, 2, max_time)
+        RETURNS
+        -------
+        alignment (batch, max_time)
+        """
+
+        processed_query = self.query_layer(query.unsqueeze(1))
+        processed_attention_weights = self.location_layer(attention_weights_cat)
+        energies = self.v(torch.tanh(
+            processed_query + processed_attention_weights + processed_memory))
+
+        energies = energies.squeeze(-1)
+        return energies
+
+    def forward(self, attention_hidden_state, memory, processed_memory,
+                attention_weights_cat, mask, log_alpha):
+        """
+        PARAMS
+        ------
+        attention_hidden_state: attention rnn last output
+        memory: encoder outputs
+        processed_memory: processed encoder outputs
+        attention_weights_cat: previous and cumulative attention weights
+        mask: binary mask for padded data
+        """
+        log_energy = self.get_alignment_energies(
+            attention_hidden_state, processed_memory, attention_weights_cat)
+
+        #log_energy =
+
+        if mask is not None:
+            log_energy.data.masked_fill_(mask, self.score_mask_value)
+
+        #attention_weights = F.softmax(alignment, dim=1)
+
+        #content_score = log_energy.unsqueeze(1) #[B, MAX_TIME] -> [B, 1, MAX_TIME]
+        #log_alpha = log_alpha.unsqueeze(2) #[B, MAX_TIME] -> [B, MAX_TIME, 1]
+
+        #log_total_score = log_alpha + content_score
+
+        #previous_attention_weights = attention_weights_cat[:,0,:]
+
+        log_alpha_shift_padded = []
+        max_time = log_energy.size(1)
+        for sft in range(2):
+            shifted = log_alpha[:, :max_time - sft]
+            shift_padded = F.pad(shifted, (sft, 0), 'constant', self.score_mask_value)
+            log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
+
+        biased = torch.logsumexp(torch.cat(log_alpha_shift_padded, 2), 2)
+
+        log_alpha_new = biased + log_energy
+
+        attention_weights = F.softmax(log_alpha_new, dim=1)
+
+        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
+        attention_context = attention_context.squeeze(1)
+
+        return attention_context, attention_weights, log_alpha_new
+
+
+class PhaseShuffle2d(nn.Module):
+    def __init__(self, n=2):
+        super(PhaseShuffle2d, self).__init__()
+        self.n = n
+        self.random = random.Random(1)
+
+    def forward(self, x, move=None):
+        # x.size = (B, C, M, L)
+        if move is None:
+            move = self.random.randint(-self.n, self.n)
+
+        if move == 0:
+            return x
+        else:
+            left = x[:, :, :, :move]
+            right = x[:, :, :, move:]
+            shuffled = torch.cat([right, left], dim=3)
+            return shuffled
+
+class PhaseShuffle1d(nn.Module):
+    def __init__(self, n=2):
+        super(PhaseShuffle1d, self).__init__()
+        self.n = n
+        self.random = random.Random(1)
+
+    def forward(self, x, move=None):
+        # x.size = (B, C, L)
+        if move is None:
+            move = self.random.randint(-self.n, self.n)
+
+        if move == 0:
+            return x
+        else:
+            left = x[:, :, :move]
+            right = x[:, :, move:]
+            shuffled = torch.cat([right, left], dim=2)
+
+            return shuffled
+
+class MFCC(nn.Module):
+    def __init__(self, n_mfcc=40, n_mels=80):
+        super(MFCC, self).__init__()
+        self.n_mfcc = n_mfcc
+        self.n_mels = n_mels
+        self.norm = 'ortho'
+        dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
+        self.register_buffer('dct_mat', dct_mat)
+
+    def forward(self, mel_specgram):
+        if len(mel_specgram.shape) == 2:
+            mel_specgram = mel_specgram.unsqueeze(0)
+            unsqueezed = True
+        else:
+            unsqueezed = False
+        # (channel, n_mels, time).transpose(...) dot (n_mels, n_mfcc)
+        # -> (channel, time, n_mfcc).transpose(...)
+        mfcc = torch.matmul(mel_specgram.transpose(1, 2), self.dct_mat).transpose(1, 2)
+
+        # unpack batch
+        if unsqueezed:
+            mfcc = mfcc.squeeze(0)
+        return mfcc
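
The MFCC module at the end of this file is a plain DCT over mel bins; a minimal sketch of its input/output contract:

import torch
from chiluka.pretrained.ASR.layers import MFCC

mfcc = MFCC(n_mfcc=40, n_mels=80)
mel = torch.randn(80, 120)   # (n_mels, time); unbatched input is also accepted
coeffs = mfcc(mel)           # -> (40, 120): 40 cepstral coefficients per frame
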
chiluka/pretrained/ASR/models.py
ADDED
@@ -0,0 +1,186 @@
+import math
+import torch
+from torch import nn
+from torch.nn import TransformerEncoder
+import torch.nn.functional as F
+from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock
+
+class ASRCNN(nn.Module):
+    def __init__(self,
+                 input_dim=80,
+                 hidden_dim=256,
+                 n_token=35,
+                 n_layers=6,
+                 token_embedding_dim=256,
+                 ):
+        super().__init__()
+        self.n_token = n_token
+        self.n_down = 1
+        self.to_mfcc = MFCC()
+        self.init_cnn = ConvNorm(input_dim//2, hidden_dim, kernel_size=7, padding=3, stride=2)
+        self.cnns = nn.Sequential(
+            *[nn.Sequential(
+                ConvBlock(hidden_dim),
+                nn.GroupNorm(num_groups=1, num_channels=hidden_dim)
+            ) for n in range(n_layers)])
+        self.projection = ConvNorm(hidden_dim, hidden_dim // 2)
+        self.ctc_linear = nn.Sequential(
+            LinearNorm(hidden_dim//2, hidden_dim),
+            nn.ReLU(),
+            LinearNorm(hidden_dim, n_token))
+        self.asr_s2s = ASRS2S(
+            embedding_dim=token_embedding_dim,
+            hidden_dim=hidden_dim//2,
+            n_token=n_token)
+
+    def forward(self, x, src_key_padding_mask=None, text_input=None):
+        x = self.to_mfcc(x)
+        x = self.init_cnn(x)
+        x = self.cnns(x)
+        x = self.projection(x)
+        x = x.transpose(1, 2)
+        ctc_logit = self.ctc_linear(x)
+        if text_input is not None:
+            _, s2s_logit, s2s_attn = self.asr_s2s(x, src_key_padding_mask, text_input)
+            return ctc_logit, s2s_logit, s2s_attn
+        else:
+            return ctc_logit
+
+    def get_feature(self, x):
+        x = self.to_mfcc(x.squeeze(1))
+        x = self.init_cnn(x)
+        x = self.cnns(x)
+        x = self.projection(x)
+        return x
+
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1)).to(lengths.device)
+        return mask
+
+    def get_future_mask(self, out_length, unmask_future_steps=0):
+        """
+        Args:
+            out_length (int): returned mask shape is (out_length, out_length).
+            unmask_future_steps (int): unmasking future step size.
+        Return:
+            mask (torch.BoolTensor): mask future timesteps mask[i, j] = True if i > j + unmask_future_steps else False
+        """
+        index_tensor = torch.arange(out_length).unsqueeze(0).expand(out_length, -1)
+        mask = torch.gt(index_tensor, index_tensor.T + unmask_future_steps)
+        return mask
+
+class ASRS2S(nn.Module):
+    def __init__(self,
+                 embedding_dim=256,
+                 hidden_dim=512,
+                 n_location_filters=32,
+                 location_kernel_size=63,
+                 n_token=40):
+        super(ASRS2S, self).__init__()
+        self.embedding = nn.Embedding(n_token, embedding_dim)
+        val_range = math.sqrt(6 / hidden_dim)
+        self.embedding.weight.data.uniform_(-val_range, val_range)
+
+        self.decoder_rnn_dim = hidden_dim
+        self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
+        self.attention_layer = Attention(
+            self.decoder_rnn_dim,
+            hidden_dim,
+            hidden_dim,
+            n_location_filters,
+            location_kernel_size
+        )
+        self.decoder_rnn = nn.LSTMCell(self.decoder_rnn_dim + embedding_dim, self.decoder_rnn_dim)
+        self.project_to_hidden = nn.Sequential(
+            LinearNorm(self.decoder_rnn_dim * 2, hidden_dim),
+            nn.Tanh())
+        self.sos = 1
+        self.eos = 2
+
+    def initialize_decoder_states(self, memory, mask):
+        """
+        memory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
+        """
+        B, L, H = memory.shape
+        self.decoder_hidden = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
+        self.decoder_cell = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
+        self.attention_weights = torch.zeros((B, L)).type_as(memory)
+        self.attention_weights_cum = torch.zeros((B, L)).type_as(memory)
+        self.attention_context = torch.zeros((B, H)).type_as(memory)
+        self.memory = memory
+        self.processed_memory = self.attention_layer.memory_layer(memory)
+        self.mask = mask
+        self.unk_index = 3
+        self.random_mask = 0.1
+
+    def forward(self, memory, memory_mask, text_input):
+        """
+        memory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
+        memory_mask.shape = (B, L, )
+        text_input.shape = (B, T)
+        """
+        self.initialize_decoder_states(memory, memory_mask)
+        # text random mask
+        random_mask = (torch.rand(text_input.shape) < self.random_mask).to(text_input.device)
+        _text_input = text_input.clone()
+        _text_input.masked_fill_(random_mask, self.unk_index)
+        decoder_inputs = self.embedding(_text_input).transpose(0, 1)  # -> [T, B, channel]
+        start_embedding = self.embedding(
+            torch.LongTensor([self.sos]*decoder_inputs.size(1)).to(decoder_inputs.device))
+        decoder_inputs = torch.cat((start_embedding.unsqueeze(0), decoder_inputs), dim=0)
+
+        hidden_outputs, logit_outputs, alignments = [], [], []
+        while len(hidden_outputs) < decoder_inputs.size(0):
+
+            decoder_input = decoder_inputs[len(hidden_outputs)]
+            hidden, logit, attention_weights = self.decode(decoder_input)
+            hidden_outputs += [hidden]
+            logit_outputs += [logit]
+            alignments += [attention_weights]
+
+        hidden_outputs, logit_outputs, alignments = \
+            self.parse_decoder_outputs(
+                hidden_outputs, logit_outputs, alignments)
+
+        return hidden_outputs, logit_outputs, alignments
+
+
+    def decode(self, decoder_input):
+
+        cell_input = torch.cat((decoder_input, self.attention_context), -1)
+        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
+            cell_input,
+            (self.decoder_hidden, self.decoder_cell))
+
+        attention_weights_cat = torch.cat(
+            (self.attention_weights.unsqueeze(1),
+             self.attention_weights_cum.unsqueeze(1)), dim=1)
+
+        self.attention_context, self.attention_weights = self.attention_layer(
+            self.decoder_hidden,
+            self.memory,
+            self.processed_memory,
+            attention_weights_cat,
+            self.mask)
+
+        self.attention_weights_cum += self.attention_weights
+
+        hidden_and_context = torch.cat((self.decoder_hidden, self.attention_context), -1)
+        hidden = self.project_to_hidden(hidden_and_context)
+
+        # dropout to increase generalization
+        logit = self.project_to_n_symbols(F.dropout(hidden, 0.5, self.training))
+
+        return hidden, logit, self.attention_weights
+
+    def parse_decoder_outputs(self, hidden, logit, alignments):
+
+        # -> [B, T_out + 1, max_time]
+        alignments = torch.stack(alignments).transpose(0, 1)
+        # [T_out + 1, B, n_symbols] -> [B, T_out + 1, n_symbols]
+        logit = torch.stack(logit).transpose(0, 1).contiguous()
+        hidden = torch.stack(hidden).transpose(0, 1).contiguous()
+
+        return hidden, logit, alignments
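
ASRCNN halves both the feature dimension (MFCC: 80 mels to 40 coefficients) and the time axis (the stride-2 init_cnn stem) before producing per-frame CTC logits; a hedged shape-check sketch:

import torch
from chiluka.pretrained.ASR.models import ASRCNN

model = ASRCNN(input_dim=80, hidden_dim=256, n_token=178, token_embedding_dim=512)
mels = torch.randn(2, 80, 120)   # (batch, n_mels, frames)
ctc_logits = model(mels)         # (2, 60, 178): time halved by the stride-2 stem
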
chiluka/pretrained/JDC/__init__.py
ADDED
@@ -0,0 +1 @@
+
chiluka/pretrained/JDC/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (150 Bytes)
chiluka/pretrained/JDC/__pycache__/model.cpython-310.pyc
ADDED
Binary file (4.78 kB)
chiluka/pretrained/JDC/bst.t7
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54dc94364b97e18ac1dfa6287714ed121248cfaac4cfd39d061c6e0a089ef169
+size 21029926
chiluka/pretrained/JDC/model.py
ADDED
@@ -0,0 +1,190 @@
+"""
+Implementation of model from:
+Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
+Convolutional Recurrent Neural Networks" (2019)
+Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
+"""
+import torch
+from torch import nn
+
+class JDCNet(nn.Module):
+    """
+    Joint Detection and Classification Network model for singing voice melody.
+    """
+    def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
+        super().__init__()
+        self.num_class = num_class
+
+        # input = (b, 1, 31, 513), b = batch size
+        self.conv_block = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False),  # out: (b, 64, 31, 513)
+            nn.BatchNorm2d(num_features=64),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Conv2d(64, 64, 3, padding=1, bias=False),  # (b, 64, 31, 513)
+        )
+
+        # res blocks
+        self.res_block1 = ResBlock(in_channels=64, out_channels=128)  # (b, 128, 31, 128)
+        self.res_block2 = ResBlock(in_channels=128, out_channels=192)  # (b, 192, 31, 32)
+        self.res_block3 = ResBlock(in_channels=192, out_channels=256)  # (b, 256, 31, 8)
+
+        # pool block
+        self.pool_block = nn.Sequential(
+            nn.BatchNorm2d(num_features=256),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.MaxPool2d(kernel_size=(1, 4)),  # (b, 256, 31, 2)
+            nn.Dropout(p=0.2),
+        )
+
+        # maxpool layers (for auxiliary network inputs)
+        # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2)
+        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
+        # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
+        self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
+        # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2)
+        self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))
+
+        # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
+        self.detector_conv = nn.Sequential(
+            nn.Conv2d(640, 256, 1, bias=False),
+            nn.BatchNorm2d(256),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Dropout(p=0.2),
+        )
+
+        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+        self.bilstm_classifier = nn.LSTM(
+            input_size=512, hidden_size=256,
+            batch_first=True, bidirectional=True)  # (b, 31, 512)
+
+        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
+        self.bilstm_detector = nn.LSTM(
+            input_size=512, hidden_size=256,
+            batch_first=True, bidirectional=True)  # (b, 31, 512)
+
+        # input: (b * 31, 512)
+        self.classifier = nn.Linear(in_features=512, out_features=self.num_class)  # (b * 31, num_class)
+
+        # input: (b * 31, 512)
+        self.detector = nn.Linear(in_features=512, out_features=2)  # (b * 31, 2) - binary classifier
+
+        # initialize weights
+        self.apply(self.init_weights)
+
+    def get_feature_GAN(self, x):
+        seq_len = x.shape[-2]
+        x = x.float().transpose(-1, -2)
+
+        convblock_out = self.conv_block(x)
+
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+
+        return poolblock_out.transpose(-1, -2)
+
+    def get_feature(self, x):
+        seq_len = x.shape[-2]
+        x = x.float().transpose(-1, -2)
+
+        convblock_out = self.conv_block(x)
+
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+
+        return self.pool_block[2](poolblock_out)
+
+    def forward(self, x):
+        """
+        Returns:
+            classification_prediction, detection_prediction
+            sizes: (b, 31, 722), (b, 31, 2)
+        """
+        ###############################
+        # forward pass for classifier #
+        ###############################
+        seq_len = x.shape[-1]
+        x = x.float().transpose(-1, -2)
+
+        convblock_out = self.conv_block(x)
+
+        resblock1_out = self.res_block1(convblock_out)
+        resblock2_out = self.res_block2(resblock1_out)
+        resblock3_out = self.res_block3(resblock2_out)
+
+        poolblock_out = self.pool_block[0](resblock3_out)
+        poolblock_out = self.pool_block[1](poolblock_out)
+        GAN_feature = poolblock_out.transpose(-1, -2)
+        poolblock_out = self.pool_block[2](poolblock_out)
+
+        # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
+        classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
+        classifier_out, _ = self.bilstm_classifier(classifier_out)  # ignore the hidden states
+
+        classifier_out = classifier_out.contiguous().view((-1, 512))  # (b * 31, 512)
+        classifier_out = self.classifier(classifier_out)
+        classifier_out = classifier_out.view((-1, seq_len, self.num_class))  # (b, 31, num_class)
+
+        # sizes: (b, 31, 722), (b, 31, 2)
+        # classifier output consists of predicted pitch classes per frame
+        # detector output consists of: (isvoice, notvoice) estimates per frame
+        return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
+
+    @staticmethod
+    def init_weights(m):
+        if isinstance(m, nn.Linear):
+            nn.init.kaiming_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.Conv2d):
+            nn.init.xavier_normal_(m.weight)
+        elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
+            for p in m.parameters():
+                if p.data is None:
+                    continue
+
+                if len(p.shape) >= 2:
+                    nn.init.orthogonal_(p.data)
+                else:
+                    nn.init.normal_(p.data)
+
+
+class ResBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
+        super().__init__()
+        self.downsample = in_channels != out_channels
+
+        # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
+        self.pre_conv = nn.Sequential(
+            nn.BatchNorm2d(num_features=in_channels),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.MaxPool2d(kernel_size=(1, 2)),  # apply downsampling on the y axis only
+        )
+
+        # conv layers
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.LeakyReLU(leaky_relu_slope, inplace=True),
+            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
+        )
+
+        # 1 x 1 convolution layer to match the feature dimensions
+        self.conv1by1 = None
+        if self.downsample:
+            self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)
+
+    def forward(self, x):
+        x = self.pre_conv(x)
+        if self.downsample:
+            x = self.conv(x) + self.conv1by1(x)
+        else:
+            x = self.conv(x) + x
+        return x
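
In Chiluka this network serves as the pitch (F0) extractor. A hedged sketch, assuming the StyleTTS2-style single-output regression head and an 80-bin mel input of shape (batch, 1, n_mels, frames) rather than the paper's 513-bin spectrogram:

import torch
from chiluka.pretrained.JDC.model import JDCNet

f0_model = JDCNet(num_class=1, seq_len=192)
mel = torch.randn(2, 1, 80, 192)        # (batch, 1, n_mels, frames)
pitch, gan_feat, feat = f0_model(mel)   # pitch: (2, 192), one F0 value per frame
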
chiluka/pretrained/PLBERT/__pycache__/util.cpython-310.pyc
ADDED
Binary file (1.75 kB)
chiluka/pretrained/PLBERT/config.yml
ADDED
@@ -0,0 +1,30 @@
+log_dir: "Checkpoint"
+mixed_precision: "fp16"
+data_folder: "wikipedia_20220301.en.processed"
+batch_size: 192
+save_interval: 5000
+log_interval: 10
+num_process: 1 # number of GPUs
+num_steps: 1000000
+
+dataset_params:
+  tokenizer: "transfo-xl-wt103"
+  token_separator: " " # token used for phoneme separator (space)
+  token_mask: "M" # token used for phoneme mask (M)
+  word_separator: 3039 # token used for word separator (<formula>)
+  token_maps: "token_maps.pkl" # token map path
+
+  max_mel_length: 512 # max phoneme length
+
+  word_mask_prob: 0.15 # probability to mask the entire word
+  phoneme_mask_prob: 0.1 # probability to mask each phoneme
+  replace_prob: 0.2 # probability to replace phonemes
+
+model_params:
+  vocab_size: 178
+  hidden_size: 768
+  num_attention_heads: 12
+  intermediate_size: 2048
+  max_position_embeddings: 512
+  num_hidden_layers: 12
+  dropout: 0.1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0714ff85804db43e06b3b0ac5749bf90cf206257c6c5916e8a98c5933b4c21e0
|
| 3 |
+
size 25185187
|
chiluka/pretrained/PLBERT/util.py
ADDED
@@ -0,0 +1,42 @@
+import os
+import yaml
+import torch
+from transformers import AlbertConfig, AlbertModel
+
+class CustomAlbert(AlbertModel):
+    def forward(self, *args, **kwargs):
+        # Call the original forward method
+        outputs = super().forward(*args, **kwargs)
+
+        # Only return the last_hidden_state
+        return outputs.last_hidden_state
+
+
+def load_plbert(log_dir):
+    config_path = os.path.join(log_dir, "config.yml")
+    plbert_config = yaml.safe_load(open(config_path))
+
+    albert_base_configuration = AlbertConfig(**plbert_config['model_params'])
+    bert = CustomAlbert(albert_base_configuration)
+
+    files = os.listdir(log_dir)
+    ckpts = []
+    for f in os.listdir(log_dir):
+        if f.startswith("step_"): ckpts.append(f)
+
+    iters = [int(f.split('_')[-1].split('.')[0]) for f in ckpts if os.path.isfile(os.path.join(log_dir, f))]
+    iters = sorted(iters)[-1]
+
+    checkpoint = torch.load(log_dir + "/step_" + str(iters) + ".t7", map_location='cpu')
+    state_dict = checkpoint['net']
+    from collections import OrderedDict
+    new_state_dict = OrderedDict()
+    for k, v in state_dict.items():
+        name = k[7:]  # remove `module.`
+        if name.startswith('encoder.'):
+            name = name[8:]  # remove `encoder.`
+        new_state_dict[name] = v
+    del new_state_dict["embeddings.position_ids"]
+    bert.load_state_dict(new_state_dict, strict=False)
+
+    return bert
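
load_plbert scans log_dir for the highest-numbered step_*.t7 checkpoint, strips the module. and encoder. prefixes, and loads non-strictly; usage against the bundled checkpoint above:

from chiluka.pretrained.PLBERT.util import load_plbert

plbert = load_plbert("chiluka/pretrained/PLBERT")  # picks step_1000000.t7
# CustomAlbert returns last_hidden_state directly rather than a ModelOutput
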
chiluka/text_utils.py
ADDED
@@ -0,0 +1,24 @@
+"""Text processing utilities for phoneme tokenization."""
+
+_pad = "$"
+_punctuation = ';:,.!?¡¿—…"«»"" '
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+
+symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+
+class TextCleaner:
+    """Converts phoneme strings to token IDs."""
+
+    def __init__(self):
+        self.word_index_dictionary = _symbol_to_id
+
+    def __call__(self, text):
+        indexes = []
+        for char in text:
+            if char in self.word_index_dictionary:
+                indexes.append(self.word_index_dictionary[char])
+        return indexes
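
TextCleaner is a character-level lookup over the symbol table above; characters outside the table are silently dropped. A minimal sketch:

from chiluka.text_utils import TextCleaner

cleaner = TextCleaner()
tokens = cleaner("həˈloʊ")  # list of ints, one per symbol found in the table
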
chiluka/utils.py
ADDED
@@ -0,0 +1,21 @@
+"""Utility functions for Chiluka."""
+
+import torch
+from munch import Munch
+
+
+def length_to_mask(lengths):
+    """Convert lengths to attention mask."""
+    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+    mask = torch.gt(mask + 1, lengths.unsqueeze(1))
+    return mask
+
+
+def recursive_munch(d):
+    """Recursively convert dict to Munch for dot notation access."""
+    if isinstance(d, dict):
+        return Munch((k, recursive_munch(v)) for k, v in d.items())
+    elif isinstance(d, list):
+        return [recursive_munch(v) for v in d]
+    else:
+        return d
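
A quick illustration of both helpers (written for this page):

import torch
from chiluka.utils import length_to_mask, recursive_munch

mask = length_to_mask(torch.tensor([2, 4]))
# [[False, False, True, True],
#  [False, False, False, False]]  -- True marks padded positions
cfg = recursive_munch({"model": {"hidden_dim": 256}})
assert cfg.model.hidden_dim == 256
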
examples/basic_synthesis.py
ADDED
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Basic example of using Chiluka for TTS synthesis.
+
+Usage:
+    python basic_synthesis.py --reference path/to/reference.wav --text "Hello world"
+"""
+
+import argparse
+import sys
+import os
+
+# Add parent directory to path if running from examples folder
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from chiluka import Chiluka
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Chiluka TTS Synthesis")
+    parser.add_argument("--reference", "-r", required=True, help="Path to reference audio file")
+    parser.add_argument("--text", "-t", default="Hello, this is Chiluka speaking!", help="Text to synthesize")
+    parser.add_argument("--language", "-l", default="en", help="Language code (en, te, hi, etc.)")
+    parser.add_argument("--output", "-o", default="output.wav", help="Output WAV file path")
+    parser.add_argument("--alpha", type=float, default=0.3, help="Acoustic style mixing (0-1)")
+    parser.add_argument("--beta", type=float, default=0.7, help="Prosodic style mixing (0-1)")
+    parser.add_argument("--steps", type=int, default=5, help="Diffusion steps")
+    args = parser.parse_args()
+
+    # Initialize - uses bundled models
+    print("Initializing Chiluka TTS...")
+    tts = Chiluka()
+
+    # Synthesize
+    print(f"Synthesizing: '{args.text}'")
+    wav = tts.synthesize(
+        text=args.text,
+        reference_audio=args.reference,
+        language=args.language,
+        alpha=args.alpha,
+        beta=args.beta,
+        diffusion_steps=args.steps,
+    )
+
+    # Save
+    tts.save_wav(wav, args.output)
+    print(f"Done! Output saved to: {args.output}")
+
+
+if __name__ == "__main__":
+    main()
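Note: per the argument help above, alpha blends the acoustic style and beta the prosodic style of the reference clip with the model's own prediction. A sketch (not part of the commit; reference.wav and the output names are placeholders) that sweeps alpha on the same API:

# Sketch: render one sentence at several acoustic-style mixing weights.
from chiluka import Chiluka

tts = Chiluka()
for alpha in (0.1, 0.5, 0.9):
    wav = tts.synthesize(
        text="Style mixing test.",
        reference_audio="reference.wav",  # placeholder reference clip
        language="en",
        alpha=alpha,
        beta=0.7,
    )
    tts.save_wav(wav, f"mix_alpha_{alpha}.wav")  # placeholder output names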
examples/telugu_synthesis.py
ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""
+Telugu TTS synthesis example using Chiluka.
+
+Usage:
+    python telugu_synthesis.py --reference path/to/telugu_reference.wav
+"""
+
+import argparse
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from chiluka import Chiluka
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Chiluka Telugu TTS")
+    parser.add_argument("--reference", "-r", required=True, help="Path to Telugu reference audio")
+    parser.add_argument("--output", "-o", default="telugu_output.wav", help="Output file")
+    args = parser.parse_args()
+
+    # Sample Telugu texts
+    texts = [
+        "నమస్కారం, నేను చిలుక మాట్లాడుతున్నాను",  # "Hello, this is Chiluka speaking"
+        "మహారాజా తమరిని మోసగించి నేను ఎక్కడికి పారిపోగలను",  # "Maharaja, having deceived you, where could I flee?"
+        "మీకు ధన్యవాదాలు",  # "Thank you"
+    ]
+
+    # Initialize
+    print("Initializing Chiluka TTS...")
+    tts = Chiluka()
+
+    # Synthesize each text
+    for i, text in enumerate(texts):
+        print(f"\nSynthesizing ({i+1}/{len(texts)}): {text}")
+        wav = tts.synthesize(
+            text=text,
+            reference_audio=args.reference,
+            language="te",
+            alpha=0.3,
+            beta=0.7,
+        )
+
+        output_path = args.output.replace(".wav", f"_{i+1}.wav")
+        tts.save_wav(wav, output_path)
+
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
pyproject.toml
ADDED
@@ -0,0 +1,64 @@
+[build-system]
+requires = ["setuptools>=45", "wheel", "setuptools-scm[toml]>=6.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "chiluka"
+version = "0.1.0"
+description = "Chiluka - A lightweight TTS inference package based on StyleTTS2"
+readme = "README.md"
+license = {text = "MIT"}
+requires-python = ">=3.8"
+authors = [
+    {name = "Your Name", email = "your.email@example.com"}
+]
+keywords = ["tts", "text-to-speech", "speech-synthesis", "styletts2", "deep-learning"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "torch>=1.13.0",
+    "torchaudio>=0.13.0",
+    "transformers>=4.20.0",
+    "librosa>=0.9.0",
+    "phonemizer>=3.0.0",
+    "nltk>=3.7",
+    "PyYAML>=6.0",
+    "munch>=2.5.0",
+    "einops>=0.6.0",
+    "einops-exts>=0.0.4",
+    "numpy>=1.21.0",
+    "scipy>=1.7.0",
+]
+
+[project.optional-dependencies]
+playback = ["pyaudio>=0.2.11"]
+dev = ["pytest>=7.0.0", "black>=22.0.0", "isort>=5.10.0"]
+
+[project.urls]
+Homepage = "https://github.com/yourusername/chiluka"
+Documentation = "https://github.com/yourusername/chiluka#readme"
+Repository = "https://github.com/yourusername/chiluka"
+Issues = "https://github.com/yourusername/chiluka/issues"
+
+[tool.setuptools.packages.find]
+where = ["."]
+
+[tool.black]
+line-length = 120
+target-version = ['py38', 'py39', 'py310', 'py311']
+
+[tool.isort]
+profile = "black"
+line_length = 120
setup.py
ADDED
@@ -0,0 +1,60 @@
+"""Setup script for Chiluka TTS package."""
+
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setup(
+    name="chiluka",
+    version="0.1.0",
+    author="Your Name",
+    author_email="your.email@example.com",
+    description="Chiluka - A lightweight TTS inference package based on StyleTTS2",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/yourusername/chiluka",
+    packages=find_packages(),
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Topic :: Multimedia :: Sound/Audio :: Speech",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    python_requires=">=3.8",
+    install_requires=[
+        "torch>=1.13.0",
+        "torchaudio>=0.13.0",
+        "transformers>=4.20.0",
+        "librosa>=0.9.0",
+        "phonemizer>=3.0.0",
+        "nltk>=3.7",
+        "PyYAML>=6.0",
+        "munch>=2.5.0",
+        "einops>=0.6.0",
+        "einops-exts>=0.0.4",
+        "numpy>=1.21.0",
+        "scipy>=1.7.0",
+    ],
+    extras_require={
+        "playback": ["pyaudio>=0.2.11"],
+        "dev": [
+            "pytest>=7.0.0",
+            "black>=22.0.0",
+            "isort>=5.10.0",
+        ],
+    },
+    entry_points={
+        "console_scripts": [
+            "chiluka=chiluka.cli:main",  # expects a chiluka/cli.py providing main()
+        ],
+    },
+)