Spaces:

dkounadis
/

audiogen2

Sleeping

App Files Files Community

Dionyssos commited on Sep 3, 2025

Commit

26bbca3

1 Parent(s): 13909fb

try .74s LM on CPU

Browse files

Files changed (9) hide show

README.md +11 -7
app.py +715 -0
audiocraft/__init__.py +1 -0
audiocraft/builders.py +78 -0
audiocraft/encodec.py +390 -0
audiocraft/lm.py +162 -0
audiocraft/transformer.py +173 -0
audiocraft/vq.py +119 -0
vits.py +623 -0

README.md CHANGED Viewed

@@ -1,14 +1,18 @@
 ---
-title: Audiogen2
-emoji: 🌖
-colorFrom: purple
-colorTo: indigo
 sdk: gradio
-sdk_version: 5.44.1
 app_file: app.py
-pinned: false
-license: cc-by-nc-4.0
 short_description: AudioGen for CPU
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Audiogen
+emoji: 🍍
+colorFrom: gray
+colorTo: gray
 sdk: gradio
+sdk_version: 5.41.1
 app_file: app.py
 short_description: AudioGen for CPU
+license: cc-by-nc-4.0
+tags:
+- audiogen
+- soundscapes
+- shift
+- tts
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,715 @@

+# -*- coding: utf-8 -*-
+import json
+import soundfile
+import re
+import unicodedata
+import gradio as gr
+import textwrap
+import numpy as np
+import torch
+import nltk
+from num2words import num2words
+from num2word_greek.numbers2words import convert_numbers
+from vits import VitsModel, VitsTokenizer
+from audiocraft.builders import AudioGen  # fixed bug for repeated calls
+# Fetch the NLTK tokenizer data into the working directory every time this module is imported.
+nltk.download('punkt', download_dir='./')  # comment out once the data has been downloaded
+nltk.download('punkt_tab', download_dir='./')
+nltk.data.path.append('.')  # add the working directory to NLTK's data search path
+device = 'cpu'  # device the VITS model and its input tensors are moved to in other_tts()
def fix_vocals(text, lang='ron'):
    """Apply per-language respellings and spell out maths/currency symbols in `text`.

    Args:
        text: Input string.
        lang: Language code selecting the replacement table; one of 'grc', 'ron',
            'eng', 'deu', 'fra', 'hun' or 'rmc-script_latin'.

    Returns:
        The rewritten string. For an unsupported `lang` a warning is printed and
        `text` is returned unchanged.

    Replacements are applied one after another with ``str.replace``, longest key
    first (ties keep dictionary order). Because each pass sees the output of the
    previous ones, a later rule can also rewrite text inserted by an earlier rule
    (the Romanian 'ţ' -> 'ț' -> 'ts' chain relies on this).
    """
    ron_replacements = {
        'ţ': 'ț',
        'ț': 'ts',
        'î': 'u',
        'â': 'a',
        'ş': 's',
        'w': 'oui',
        'k': 'c',
        'l': 'll',
        # Math symbols
        'sqrt': ' rădăcina pătrată din ',
        '^': ' la puterea ',
        '+': ' plus ',
        ' - ': ' minus ',  # only when standalone, so that a-b-c is not read as "minus"
        '*': ' ori ',  # times
        '/': ' împărțit la ',  # divided by
        '=': ' egal cu ',  # equals
        'pi': ' pi ',
        '<': ' mai mic decât ',
        '>': ' mai mare decât ',
        '%': ' la sută ',  # percent
        '(': ' paranteză deschisă ',
        ')': ' paranteză închisă ',
        '[': ' paranteză pătrată deschisă ',
        ']': ' paranteză pătrată închisă ',
        '{': ' acoladă deschisă ',
        '}': ' acoladă închisă ',
        '≠': ' nu este egal cu ',
        '≤': ' mai mic sau egal cu ',
        '≥': ' mai mare sau egal cu ',
        '≈': ' aproximativ ',
        '∞': ' infinit ',
        '€': ' euro ',
        '$': ' dolar ',
        '£': ' liră ',
        '&': ' și ',  # and
        '@': ' la ',  # at
        '#': ' diez ',  # hash
        '∑': ' sumă ',
        '∫': ' integrală ',
        '√': ' rădăcina pătrată a ',  # more generic square root
    }
    eng_replacements = {
        'wik': 'weaky',
        'sh': 'ss',
        'ch': 'ttss',
        'oo': 'oeo',
        # Math symbols for English
        'sqrt': ' square root of ',
        '^': ' to the power of ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' times ',
        ' / ': ' divided by ',
        '=': ' equals ',
        'pi': ' pi ',
        '<': ' less than ',
        '>': ' greater than ',
        '%': ' percent ',
        '(': ' open parenthesis ',
        ')': ' close parenthesis ',
        '[': ' open bracket ',
        ']': ' close bracket ',
        '{': ' open curly brace ',
        '}': ' close curly brace ',
        '∑': ' sum ',
        '∫': ' integral ',
        '√': ' square root of ',
        '≠': ' not equals ',
        '≤': ' less than or equals ',
        '≥': ' greater than or equals ',
        '≈': ' approximately ',
        '∞': ' infinity ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pound ',
        '&': ' and ',
        '@': ' at ',
        '#': ' hash ',
    }
    serbian_replacements = {
        'rn': 'rrn',
        'ć': 'č',
        'c': 'č',
        'đ': 'd',
        'j': 'i',
        'l': 'lll',
        'w': 'v',
        #  https://huggingface.co/facebook/mms-tts-rmc-script_latin
        'sqrt': ' kvadratni koren iz ',
        '^': ' na stepen ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' puta ',
        ' / ': ' podeljeno sa ',
        '=': ' jednako ',
        'pi': ' pi ',
        '<': ' manje od ',
        '>': ' veće od ',
        '%': ' procenat ',
        '(': ' otvorena zagrada ',
        ')': ' zatvorena zagrada ',
        '[': ' otvorena uglasta zagrada ',
        ']': ' zatvorena uglasta zagrada ',
        '{': ' otvorena vitičasta zagrada ',
        '}': ' zatvorena vitičasta zagrada ',
        '∑': ' suma ',
        '∫': ' integral ',
        '√': ' kvadratni koren ',
        '≠': ' nije jednako ',
        '≤': ' manje ili jednako od ',
        '≥': ' veće ili jednako od ',
        '≈': ' približno ',
        '∞': ' beskonačnost ',
        '€': ' evro ',
        '$': ' dolar ',
        '£': ' funta ',
        '&': ' i ',
        '@': ' et ',
        '#': ' taraba ',
    }
    deu_replacements = {
        'sch': 'sh',
        'ch': 'kh',
        'ie': 'ee',
        'ei': 'ai',
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue',
        'ß': 'ss',
        # Math symbols for German
        'sqrt': ' Quadratwurzel aus ',
        '^': ' hoch ',
        '+': ' plus ',
        ' - ': ' minus ',
        '*': ' mal ',
        ' / ': ' geteilt durch ',
        '=': ' gleich ',
        'pi': ' pi ',
        '<': ' kleiner als ',
        '>': ' größer als ',
        '%': ' prozent ',
        '(': ' Klammer auf ',
        ')': ' Klammer zu ',
        '[': ' eckige Klammer auf ',
        ']': ' eckige Klammer zu ',
        '{': ' geschweifte Klammer auf ',
        '}': ' geschweifte Klammer zu ',
        '∑': ' Summe ',
        '∫': ' Integral ',
        '√': ' Quadratwurzel ',
        '≠': ' ungleich ',
        '≤': ' kleiner oder gleich ',
        '≥': ' größer oder gleich ',
        '≈': ' ungefähr ',
        '∞': ' unendlich ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' pfund ',
        '&': ' und ',
        '@': ' at ',  # 'Klammeraffe' is also common but 'at' is simpler
        '#': ' raute ',
    }
    fra_replacements = {
        # French specific phonetic replacements (add as needed)
        'w': 'v',
        # Math symbols for French
        'sqrt': ' racine carrée de ',
        '^': ' à la puissance ',
        '+': ' plus ',
        ' - ': ' moins ',
        '*': ' fois ',
        ' / ': ' divisé par ',
        '=': ' égale ',
        'pi': ' pi ',
        '<': ' inférieur à ',
        '>': ' supérieur à ',
        '%': ' pour cent ',
        '(': ' parenthèse ouverte ',
        ')': ' parenthèse fermée ',
        '[': ' crochet ouvert ',
        ']': ' crochet fermé ',
        '{': ' accolade ouverte ',
        '}': ' accolade fermée ',
        '∑': ' somme ',
        '∫': ' intégrale ',
        '√': ' racine carrée ',
        '≠': ' n\'égale pas ',
        '≤': ' inférieur ou égal à ',
        '≥': ' supérieur ou égal à ',
        '≈': ' approximativement ',
        '∞': ' infini ',
        '€': ' euro ',
        '$': ' dollar ',
        '£': ' livre ',
        '&': ' et ',
        '@': ' arobase ',
        '#': ' dièse ',
    }
    hun_replacements = {
        # Hungarian specific phonetic replacements (add as needed)
        'ch': 'ts',
        'cs': 'tz',
        'g': 'gk',
        'w': 'v',
        'z': 'zz',
        # Math symbols for Hungarian
        'sqrt': ' négyzetgyök ',
        '^': ' hatvány ',
        '+': ' plusz ',
        ' - ': ' mínusz ',
        '*': ' szorozva ',
        ' / ': ' osztva ',
        '=': ' egyenlő ',
        'pi': ' pi ',
        '<': ' kisebb mint ',
        '>': ' nagyobb mint ',
        '%': ' százalék ',
        '(': ' nyitó zárójel ',
        ')': ' záró zárójel ',
        '[': ' nyitó szögletes zárójel ',
        ']': ' záró szögletes zárójel ',
        '{': ' nyitó kapcsos zárójel ',
        '}': ' záró kapcsos zárójel ',
        '∑': ' szumma ',
        '∫': ' integrál ',
        '√': ' négyzetgyök ',
        '≠': ' nem egyenlő ',
        '≤': ' kisebb vagy egyenlő ',
        '≥': ' nagyobb vagy egyenlő ',
        '≈': ' körülbelül ',
        '∞': ' végtelen ',
        '€': ' euró ',
        '$': ' dollár ',
        '£': ' font ',
        '&': ' és ',
        '@': ' kukac ',
        '#': ' kettőskereszt ',
    }
    grc_replacements = {
        # Math symbols for Ancient Greek (literal translations)
        'sqrt': ' τετραγωνικὴ ῥίζα ',
        '^': ' εἰς δύναμιν ',
        '+': ' σὺν ',
        ' - ': ' χωρὶς ',
        '*': ' πολλάκις ',
        ' / ': ' διαιρέω ',
        '=': ' ἴσον ',
        'pi': ' πῖ ',
        '<': ' ἔλαττον ',
        '>': ' μείζον ',
        '%': ' τοῖς ἑκατόν ',  # tois hekaton - 'of the hundred'
        '(': ' ἀνοικτὴ παρένθεσις ',
        ')': ' κλειστὴ παρένθεσις ',
        '[': ' ἀνοικτὴ ἀγκύλη ',
        ']': ' κλειστὴ ἀγκύλη ',
        '{': ' ἀνοικτὴ σγουρὴ ἀγκύλη ',
        '}': ' κλειστὴ σγουρὴ ἀγκύλη ',
        '∑': ' ἄθροισμα ',
        '∫': ' ὁλοκλήρωμα ',
        '√': ' τετραγωνικὴ ῥίζα ',
        '≠': ' οὐκ ἴσον ',
        '≤': ' ἔλαττον ἢ ἴσον ',
        '≥': ' μεῖζον ἢ ἴσον ',
        '≈': ' περίπου ',
        '∞': ' ἄπειρον ',
        '€': ' εὐρώ ',
        '$': ' δολάριον ',
        '£': ' λίρα ',
        '&': ' καὶ ',
        '@': ' ἀτ ',  # at
        '#': ' δίεση ',  # hash
    }
    # Language code -> replacement table.
    replacements_map = {
        'grc': grc_replacements,
        'ron': ron_replacements,
        'eng': eng_replacements,
        'deu': deu_replacements,
        'fra': fra_replacements,
        'hun': hun_replacements,
        'rmc-script_latin': serbian_replacements,
    }
    current_replacements = replacements_map.get(lang)
    if not current_replacements:
        # Unsupported language: leave the text untouched.
        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
        return text
    # Longest keys first, so multi-character keys ('sqrt', 'sch') are consumed
    # before any shorter key that is a substring of them ('ch').
    sorted_replacements = sorted(current_replacements.items(), key=lambda item: len(item[0]), reverse=True)
    for old, new in sorted_replacements:
        text = text.replace(old, new)
    return text
+import unicodedata
def only_greek_or_only_latin(text, lang='grc'):
    '''Lowercase `text` and transliterate it into a single script.

    Args:
        text (str): Input that may mix Latin, Greek and Cyrillic letters.
        lang (str): 'grc' converts Latin and Cyrillic letters to Greek; any
            other value converts Greek and Cyrillic letters to Latin.

    Returns:
        str: The lowercased, transliterated string. Characters with no entry in
             the active mapping are kept as they are (digits, punctuation,
             accented Latin letters such as 'é', and accented Greek letters,
             since the Greek keys below are unaccented).
    '''
    # --- Mapping Dictionaries ---
    # Keys are lowercase because the input is lowercased before matching.
    latin_to_greek_map = {
        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
        'ch': 'τσο',  # multi-character Latin sequence
        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'k': 'κ', 'l': 'λ',
        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π',
        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
    }
    greek_to_latin_map = {
        'ου': 'ou',  # digraph, matched before the single letters
        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y',  # 'y' is a common transliteration for upsilon
        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
        'ς': 's',  # final sigma
    }
    cyrillic_to_latin_map = {
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
        'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
        'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
        'я': 'ya',
    }
    # Direct Cyrillic to Greek mapping based on phonetic similarity (approximate).
    # Every key must be a Cyrillic code point: к/л/п/р/с/т look almost identical
    # to Greek κ/λ/π/ρ/σ/τ but are different characters.
    cyrillic_to_greek_map = {
        'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
        'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ', 'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
        'п': 'π', 'р': 'ρ', 'с': 'σ', 'т': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
        'ч': 'τσ',  # or τζ depending on desired sound
        'ш': 'σ', 'щ': 'σ',  # approximations
        'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
        'я': 'ια',
    }
    if lang == 'grc':
        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
    else:
        conversion_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
    # Longest keys first so digraphs ('ch', 'sc', 'ου') win over their first letter.
    sorted_source_keys = sorted(conversion_map, key=len, reverse=True)

    lowercased_text = text.lower()
    output_chars = []
    current_index = 0
    while current_index < len(lowercased_text):
        for key in sorted_source_keys:
            if lowercased_text.startswith(key, current_index):
                output_chars.append(conversion_map[key])
                current_index += len(key)
                break
        else:
            # No mapping starts here: keep the character unchanged.
            output_chars.append(lowercased_text[current_index])
            current_index += 1
    return ''.join(output_chars)
def _num2words(text='01234', lang=None):
    """Spell out the number given in `text` as words for language `lang`.

    'grc' is routed to convert_numbers(); every other language goes to num2words().
    """
    if lang != 'grc':
        # lang has to be passed by keyword
        return num2words(text, lang=lang)
    return convert_numbers(text)
def transliterate_number(number_string,
                         lang=None):
    """Replace every numeral in `number_string` with its spoken form.

    Handles plain integers, decimals ('3.14' -> integer part, the separator
    word, then the decimal digits one by one) and scientific notation
    ('1e5' -> base, the exponent phrase, exponent).

    Args:
        number_string: Text that may contain numerals.
        lang: TTS language code ('rmc-script_latin', 'ron', 'hun', 'deu', 'fra',
            'grc'); anything else is cut to its first two letters and passed to
            num2words, and None falls back to English.

    Returns:
        The text with numerals spelled out; a numeral that cannot be converted
        is left unchanged.
    """
    if lang == 'rmc-script_latin':
        lang = 'sr'
        exponential_pronoun = ' puta deset na stepen od '
        comma = ' tačka '
    elif lang == 'ron':
        lang = 'ro'
        exponential_pronoun = ' ori zece la puterea '
        comma = ' virgulă '
    elif lang == 'hun':
        lang = 'hu'
        exponential_pronoun = ' tízszer a erejéig '
        comma = ' virgula '
    elif lang == 'deu':
        lang = 'de'  # explicit two-letter code, like the other branches
        exponential_pronoun = ' mal zehn hoch '
        comma = ' komma '
    elif lang == 'fra':
        lang = 'fr'
        exponential_pronoun = ' puissance '
        comma = ' virgule '  # spaces keep the word apart from its neighbours
    elif lang == 'grc':
        exponential_pronoun = ' εις την δυναμην του '
        comma = ' κομμα '
    else:
        lang = (lang or 'en')[:2]  # lang=None (the default) used to raise TypeError here
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def replace_number(match):
        """Return the spoken form of one regex match (prefix + number + suffix)."""
        prefix = match.group(1) or ""
        number_part = match.group(2)
        suffix = match.group(5) or ""
        try:
            if 'e' in number_part.lower():
                base, exponent = number_part.lower().split('e')
                words = _num2words(base, lang=lang) + exponential_pronoun + _num2words(exponent, lang=lang)
            elif '.' in number_part:
                integer_part, decimal_part = number_part.split('.')
                words = _num2words(integer_part, lang=lang) + comma + " ".join(
                    [_num2words(digit, lang=lang) for digit in decimal_part])
            else:
                words = _num2words(number_part, lang=lang)
            return prefix + words + suffix
        except (ValueError, ArithmeticError, NotImplementedError):
            # Best effort: keep the original text if conversion fails.
            # NOTE(review): num2words is expected to raise NotImplementedError for
            # an unsupported language and a decimal error (an ArithmeticError)
            # for an unparsable string - confirm against the installed version.
            return match.group(0)

    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
    return re.sub(pattern, replace_number, number_string)
+# Display names of the TTS languages; other_tts() lowercases the chosen name and
+# matches it by substring ('hun', 'ser', 'rom', 'deu', 'french', 'eng',
+# 'ancient greek') to pick the model's language code.
+language_names = ['Ancient greek',
+                  'English',
+                  'Deutsch',
+                  'French',
+                  'Hungarian',
+                  'Romanian',
+                  'Serbian (Approx.)']
def other_tts(text=None,
              lang='romanian',
              soundscape=''):
    """Synthesize `text` with an MMS-TTS VITS voice, optionally over an AudioGen background.

    Args:
        text: Text to speak.
        lang: Human-readable language name; matched by substring to a model
            language code (e.g. 'Romanian' -> 'ron'). Unrecognised names fall
            back to their first word.
        soundscape: Text prompt for the background sound; '' disables it.

    Returns:
        Path of the written 16 kHz WAV file ('_speech.wav'). The file is mono
        without a soundscape and two-channel with one.
    """
    # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
    lang = lang.lower()
    # https://huggingface.co/spaces/mms-meta/MMS
    if 'hun' in lang:
        lang_code = 'hun'
    elif any(i in lang for i in ['ser', 'bosn', 'herzegov', 'montenegr', 'macedon']):
        # romani carpathian (has also Vlax) - cooler voice
        lang_code = 'rmc-script_latin'
    elif 'rom' in lang:
        lang_code = 'ron'
    elif 'ger' in lang or 'deu' in lang or 'allem' in lang:
        lang_code = 'deu'
    elif 'french' in lang:
        lang_code = 'fra'
    elif 'eng' in lang:
        lang_code = 'eng'
    elif 'ancient greek' in lang:
        lang_code = 'grc'
    else:
        lang_code = lang.split()[0].strip()   # latin & future option
    # LATIN / GRC / CYRILLIC
    text = only_greek_or_only_latin(text, lang=lang_code)  # Greek chars if lang_code == 'grc', Latin otherwise
    # NUMERALS (done before fix_vocals so digits are spelled out before symbols are replaced)
    text = transliterate_number(text, lang=lang_code)
    # PRONOUNC.
    text = fix_vocals(text, lang=lang_code)
    # VITS - model and tokenizer are cached in module globals and reloaded only
    # when the requested language changes.
    global cached_lang_code, cached_net_g, cached_tokenizer
    if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
        cached_lang_code = lang_code
        cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
        cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
    net_g = cached_net_g
    tokenizer = cached_tokenizer
    total_audio = []
    if not isinstance(text, list):
        # Few long chunks: short sentences call the model many times, which is slower on CPU.
        text = textwrap.wrap(text, width=439)
    for _t in text:
        inputs = tokenizer(_t, return_tensors="pt")
        with torch.no_grad():
            x = net_g(input_ids=inputs.input_ids.to(device),
                      attention_mask=inputs.attention_mask.to(device),
                      lang_code=lang_code,
                      )[0, :]
            total_audio.append(x)
        print(f'\n\n_______________________________ {_t} {x.shape=}')
    x = torch.cat(total_audio).cpu().numpy()
    # AUDIOGEN
    if soundscape != '':
        # Built only when a background is requested, so plain TTS calls do not
        # load the AudioGen checkpoints.
        audiogen = AudioGen().eval().to('cpu')
        background = audiogen.generate(
            soundscape,
            duration=len(x)/16000 + .74,  # duration in seconds
        ).detach().cpu().numpy()
        background /= 1.02 * np.abs(background).max() + 1e-7  # volume to [-1,1]
        if len(background) < len(x):
            # Guard: zero-pad a background that came out shorter than the speech,
            # otherwise the mix below fails on mismatched lengths.
            background = np.pad(background, (0, len(x) - len(background)))
        background = background[:len(x), None]
        x = x[:, None]
        # Two-channel output; both channels carry the same speech/background mix.
        x = np.concatenate(
            [.49 * x + .51 * background,
             .51 * background + .49 * x], 1)
    tmp_file = '_speech.wav'  # N x clients (cleanup vs tmp file / client)
    soundfile.write(tmp_file, x, 16000)
    return tmp_file
+# Runs at import time: synthesizes the sample sentence with a 'cats meowing'
+# background and writes the result to disk. The Gradio UI below is commented out.
+other_tts(text='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
+          lang='English',
+          soundscape='cats meowing')
+# iface = gr.Interface(
+#     fn=other_tts,
+#     # title="audioNarTTS",
+#     # description='TTS - [VITS duration of oscillation](https://huggingface.co/spaces/dkounadis/audioNarTTS/blob/main/vits.py#L560) via [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) langs. For [SHIFT-europe](https://shift-europe.eu/).',
+#     inputs=[
+#         gr.Textbox(lines=4,
+#                    value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
+#                    label="Type text for TTS"),
+#         gr.Dropdown(
+#             choices=language_names,
+#             label="TTS lang",
+#             value="Ancient greek",
+#         ),
+#         gr.Textbox(lines=1,
+#                    value="dogs barg",
+#                    label="AudioGen Txt"
+#         ),
+#     ],
+#     outputs="audio",
+# )
+# iface.launch()

audiocraft/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .builders import AudioGen

audiocraft/builders.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import torch
+from torch import nn
+from omegaconf import OmegaConf
+import numpy as np
+from huggingface_hub import hf_hub_download
+import os
+from audiocraft.encodec import EncodecModel
+from audiocraft.lm import LMModel
+N_REPEAT = 2  # number of copies of the prompt (and of the empty prompt) batched together in AudioGen.generate()
+def _shift(x):
+    #print(x.shape, 'BATCH Independent SHIFT\n AudioGen')
+    for i, _slice in enumerate(x):
+        n = x.shape[2]
+        offset = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD
+        print(offset)
+        x[i, :, :] = torch.roll(_slice, offset, dims=1)  # _slice 2D
+    return x
+class AudioGen(torch.nn.Module):
+    """Text-to-sound generator built from the facebook/audiogen-medium checkpoints.
+
+    Holds a decode-only compression model (`compression_model`) and a language
+    model (`lm`); both are put in eval mode.
+    """
+    # https://huggingface.co/facebook/audiogen-medium
+    def __init__(self):
+        """Download both checkpoints from the Hugging Face Hub and load them on CPU."""
+        super().__init__()
+        _file_1 = hf_hub_download(
+            repo_id='facebook/audiogen-medium',
+            filename="compression_state_dict.bin",
+            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
+            library_name="audiocraft",
+            library_version= '1.3.0a1')  # Found at __init__.py #audiocraft.__version__)
+        pkg = torch.load(_file_1, map_location='cpu')# kwargs = OmegaConf.create(pkg['xp.cfg'])
+        self.compression_model = EncodecModel()
+        # strict=False: ckpt has also unused encoder weights
+        self.compression_model.load_state_dict(pkg['best_state'], strict=False)
+        self.compression_model.eval()  # ckpt has also unused encoder weights
+        #  T5 &
+        #  LM
+        _file_2 = hf_hub_download(
+            repo_id='facebook/audiogen-medium',
+            filename="state_dict.bin",
+            cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
+            library_name="audiocraft",
+            library_version= '1.3.0a1')  # Found at __init__.py #audiocraft.__version__)
+        pkg = torch.load(_file_2, map_location='cpu')
+        cfg = OmegaConf.create(pkg['xp.cfg'])  # CFG inside torch bin; not used further in this method
+        _best = pkg['best_state']
+        # Rename the text-conditioner projection keys to the 't5.output_proj.*'
+        # names; _best is the same dict as pkg['best_state'], so the strict load
+        # below sees the renamed keys.
+        _best['t5.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')#.to(torch.float)
+        _best['t5.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')#.to(torch.float)
+        self.lm = LMModel()
+        self.lm.load_state_dict(pkg['best_state'], strict=True)
+        self.lm.eval()
+    @torch.no_grad()
+    def generate(self,
+                 prompt='dogs mewo',
+                 duration=2.24,  # seconds of audio
+                 ):
+        """Generate audio for `prompt` and return it as a flat 1-D tensor.
+
+        `duration` (seconds) sets `lm.n_draw` and the number of tokens requested
+        from the language model. The torch seed is reset to 42 on every call.
+        """
+        torch.manual_seed(42)  # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
+        self.lm.n_draw = int(duration / .74) + 1  # one draw per 0.74 seconds of audio (rounded down), plus one
+        with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
+            gen_tokens = self.lm.generate(
+                text_condition=[prompt] * N_REPEAT  + [''] * N_REPEAT,#['dogs', 'dogs...!', '', '']
+                max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate)
+                ) # [bs, 4, 74 * self.lm.n_draw]
+        x = self.compression_model.decode(gen_tokens)   #[bs, 1, 11840]
+        # Apply the random per-item circular shift seven times in a row.
+        for _ in range(7):  # perhaps shift is too random as already lm.n_draw has randomness
+               x = _shift(x)
+        return x.reshape(-1) #x / (x.abs().max() + 1e-7)

audiocraft/encodec.py ADDED Viewed

	@@ -0,0 +1,390 @@

+import numpy as np
+import torch
+from torch import nn
+import math
+import typing as tp
+import warnings
+import torch
+from torch.nn import functional as F
+from torch.nn.utils import weight_norm
+from audiocraft.vq import ResidualVectorQuantizer
class EncodecModel(nn.Module):
    """Decode-only codec: de-quantizes codes and runs them through the SEANet decoder."""

    def __init__(self):
        super().__init__()
        self.decoder = SEANetDecoder()
        self.quantizer = ResidualVectorQuantizer()
        self.frame_rate = 50

    def decode(self, codes):
        """Turn codes into decoder output (B,K,T -> B,C,T)."""
        return self.decoder(self.quantizer.decode(codes))
class StreamableLSTM(nn.Module):
    """LSTM wrapper that hides the hidden state and the data layout.

    Takes and returns tensors in convolutional layout.
    """

    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
        super().__init__()
        self.skip = skip
        self.lstm = nn.LSTM(dimension, dimension, num_layers)

    def forward(self, x):
        """Run the LSTM over the last axis of `x`, adding the input back when `skip` is set."""
        print('LSTM called 1c')
        seq_first = x.permute(2, 0, 1)  # move the last axis to the front for nn.LSTM
        out, _ = self.lstm(seq_first)
        if self.skip:
            out = out + seq_first
        return out.permute(1, 2, 0)  # back to the input layout
+class SEANetResnetBlock(nn.Module):
+    """Residual block from SEANet model.
+    Args:
+        dim (int): Dimension of the input/output.
+        kernel_sizes (list): List of kernel sizes for the convolutions.
+        dilations (list): List of dilations for the convolutions.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection.
+    """
+    # NOTE(review): the list/dict defaults are shared between calls; this block only reads them.
+    def __init__(self, dim: int, kernel_sizes: tp.List[int] = [3, 1], dilations: tp.List[int] = [1, 1],
+                 activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, causal: bool = False,
+                 pad_mode: str = 'reflect', compress: int = 2, true_skip: bool = True):
+        super().__init__()
+        assert len(kernel_sizes) == len(dilations), 'Number of kernel sizes should match number of dilations'
+        act = getattr(nn, activation)  # activation class looked up by name in torch.nn
+        hidden = dim // compress  # channel count between the convolutions of the residual branch
+        block = []
+        # One (activation, convolution) pair per kernel size: the first pair takes
+        # `dim` channels in, the last gives `dim` channels out, `hidden` in between.
+        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = dim if i == 0 else hidden
+            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+            block += [
+                act(**activation_params),
+                StreamableConv1d(in_chs, out_chs, kernel_size=kernel_size, dilation=dilation,
+                                 norm=norm, norm_kwargs=norm_params,
+                                 causal=causal, pad_mode=pad_mode),
+            ]
+        self.block = nn.Sequential(*block)
+        self.shortcut: nn.Module
+        if true_skip:
+            self.shortcut = nn.Identity()
+        else:
+            self.shortcut = StreamableConv1d(dim, dim, kernel_size=1, norm=norm, norm_kwargs=norm_params,
+                                             causal=causal, pad_mode=pad_mode)
+    def forward(self, x):
+        """Return the shortcut path plus the residual branch applied to `x`."""
+        return self.shortcut(x) + self.block(x)
+class SEANetDecoder(nn.Module):
+#  channels=1 dimension=128 n_filters=64 n_residual_layers=1 ratios=[8, 5, 4, 2]
+# activation='ELU' activation_params={'alpha': 1.0}, final_activation=None
+# final_activation_params=None norm='weight_norm'
+# norm_params={} kernel_size=7 last_kernel_size=7 residual_kernel_size=3 dilation_base=2
+# causal=False pad_mode='constant'
+# true_skip=True compress=2 lstm=2 disable_norm_outer_blocks=0 trim_right_ratio=1.0
+    def __init__(self,
+                 channels = 1,
+                 dimension = 128,
+                 n_filters = 64,
+                 n_residual_layers = 1,
+                 ratios = [8, 5, 4, 2],
+                 activation = 'ELU',
+                 activation_params: dict = {'alpha': 1.0},
+                 final_activation = None,
+                 final_activation_params = None,
+                 norm = 'weight_norm',
+                 norm_params = {},
+                 kernel_size = 7,
+                 last_kernel_size = 7,
+                 residual_kernel_size = 3,
+                 dilation_base = 2,
+                 causal = False,
+                 pad_mode = 'constant',
+                 true_skip = True,
+                 compress = 2,
+                 lstm = 2,
+                 disable_norm_outer_blocks = 0,
+                 trim_right_ratio = 1.0):
+        super().__init__()
+        self.dimension = dimension
+        self.channels = channels
+        self.n_filters = n_filters
+        self.ratios = ratios
+        del ratios
+        self.n_residual_layers = n_residual_layers
+        self.hop_length = np.prod(self.ratios)
+        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+        self.disable_norm_outer_blocks = disable_norm_outer_blocks
+        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
+            "Number of blocks for which to disable norm is invalid." \
+            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
+        act = getattr(nn, activation)
+        mult = int(2 ** len(self.ratios))
+        model: tp.List[nn.Module] = [
+            StreamableConv1d(dimension, mult * n_filters, kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+        if lstm:
+            print('\n\n\n\nLSTM IN SEANET\n\n\n\n')
+            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
+        # Upsample to raw audio scale
+        for i, ratio in enumerate(self.ratios):
+            block_norm = 'none' if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) else norm
+            # Add upsampling layers
+            model += [
+                act(**activation_params),
+                StreamableConvTranspose1d(mult * n_filters, mult * n_filters // 2,
+                                          kernel_size=ratio * 2, stride=ratio,
+                                          norm=block_norm, norm_kwargs=norm_params,
+                                          causal=causal, trim_right_ratio=trim_right_ratio),
+            ]
+            # Add residual layers
+            for j in range(n_residual_layers):
+                model += [
+                    SEANetResnetBlock(mult * n_filters // 2, kernel_sizes=[residual_kernel_size, 1],
+                                      dilations=[dilation_base ** j, 1],
+                                      activation=activation, activation_params=activation_params,
+                                      norm=block_norm, norm_params=norm_params, causal=causal,
+                                      pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
+            mult //= 2
+        # Add final layers
+        model += [
+            act(**activation_params),
+            StreamableConv1d(n_filters, channels, last_kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+        # Add optional final activation to decoder (eg. tanh)
+        if final_activation is not None:
+            final_act = getattr(nn, final_activation)
+            final_activation_params = final_activation_params or {}
+            model += [
+                final_act(**final_activation_params)
+            ]
+        self.model = nn.Sequential(*model)
+    def forward(self, z):
+        print(f'\n   Enter seanet with shape {z.shape}\n')  # arrives here with (1,128,35)
+        # how can this convnet care for the value that is in z so it crashes?
+        y = self.model(z)
+        print(f'\n   Exit seanet with shape {y.shape}\n')  # arrives here with (1,128,35)
+        return y
+# --
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# Normalization names accepted by `apply_parametrization_norm` (it asserts membership).
+CONV_NORMALIZATIONS = frozenset(['none', 'weight_norm', 'spectral_norm',
+                                 'time_group_norm'])
+def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
+    assert norm in CONV_NORMALIZATIONS
+    if norm == 'weight_norm':
+        return weight_norm(module)
+    elif norm == 'spectral_norm':
+        raise FileNotFoundError
+        # return spectral_norm(module)
+    else:
+        raise ValueError
+        # We already check was in CONV_NORMALIZATION, so any other choice
+        # doesn't need reparametrization.
+        return module
+def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
+                                 padding_total: int = 0) -> int:
+    """See `pad_for_conv1d`."""
+    length = x.shape[-1]
+    n_frames = (length - kernel_size + padding_total) / stride + 1
+    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+    return ideal_length - length
+def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
+    If this is the case, we insert extra 0 padding to the right before the reflection happen.
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == 'reflect':
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            x = F.pad(x, (0, extra_pad))
+        padded = F.pad(x, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+    else:
+        return F.pad(x, paddings, mode, value)
+def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
+    """Remove padding from x, handling properly zero padding. Only for 1d!"""
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    assert (padding_left + padding_right) <= x.shape[-1]
+    end = x.shape[-1] - padding_right
+    return x[..., padding_left: end]
+class NormConv1d(nn.Module):
+    def __init__(self, *args,
+                 causal = False, norm = 'none',
+                 norm_kwargs = {}, **kwargs):
+        super().__init__()
+        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)  # norm = weight_norm
+    def forward(self, x):
+        return self.conv(x)
+class NormConvTranspose1d(nn.Module):
+    def __init__(self, *args, causal: bool = False, norm: str = 'none',
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
+    def forward(self, x):
+        return self.convtr(x)
+class StreamableConv1d(nn.Module):
+    """Conv1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 causal=False,
+                 norm='none',
+                 norm_kwargs={},
+                 pad_mode='reflect'):
+        super().__init__()
+        # warn user on unusual setup between dilation and stride
+        # if stride > 1 and dilation > 1:
+        #     warnings.warn("StreamableConv1d has been initialized with stride > 1 and dilation > 1"
+        #                   f" (kernel_size={kernel_size} stride={stride}, dilation={dilation}).")
+        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
+                               dilation=dilation, groups=groups, bias=bias, causal=causal,
+                               norm=norm, norm_kwargs=norm_kwargs)
+        self.causal = causal
+        self.pad_mode = pad_mode
+    def forward(self, x):
+        B, C, T = x.shape
+        kernel_size = self.conv.conv.kernel_size[0]
+        stride = self.conv.conv.stride[0]
+        dilation = self.conv.conv.dilation[0]
+        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
+        padding_total = kernel_size - stride
+        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+        if self.causal:
+            # Left padding for causal
+            # x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
+            print('\n   \n\n\nn\n\n\nnCAUSAL N\n\n\n')
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            print(f'L147 PADs {padding_left=} {padding_right=} {extra_padding=}')
+            x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)
+            # print(f'\n   \/n\n\n\nANTICaus N {x.shape=}\n')
+            # ANTICaus CONV OLD_SHAPE=torch.Size([1, 512, 280]) x.shape=torch.Size([1, 512, 282])
+        return self.conv(x)
+class StreamableConvTranspose1d(nn.Module):
+    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+    def __init__(self, in_channels: int, out_channels: int,
+                 kernel_size: int, stride: int = 1, causal: bool = False,
+                 norm: str = 'none', trim_right_ratio: float = 1.,
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}):
+        super().__init__()
+        self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
+                                          causal=causal, norm=norm, norm_kwargs=norm_kwargs)
+        self.causal = causal
+        self.trim_right_ratio = trim_right_ratio
+        assert self.causal or self.trim_right_ratio == 1., \
+            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+        assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.
+    def forward(self, x):
+        kernel_size = self.convtr.convtr.kernel_size[0]
+        stride = self.convtr.convtr.stride[0]
+        padding_total = kernel_size - stride
+        y = self.convtr(x)
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            print('\n   \n\n\nn\n\n\nnCAUSAL T\n\n\n\n\n')
+        else:
+            # Asymmetric padding required for odd strides
+            # print('\n   \n\n\nn\n\n\nnANTICAUSAL T\n\n\n')
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        return y

audiocraft/lm.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import torch
+from audiocraft.transformer import StreamingTransformer
+from torch import nn
+from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
+class T5(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.output_proj = nn.Linear(1024,  # t5-large
+                                     1536)  # lm hidden
+        self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-large', legacy=True)
+        t5 = T5EncoderModel.from_pretrained('t5-large').train(mode=False)
+        # this makes sure that the t5 is not part
+        # of the saved checkpoint
+        self.__dict__['t5'] = t5.to('cpu')
+    def forward(self, prompt):
+        with torch.set_grad_enabled(False): #, torch.autocast(device_type='cpu', dtype=torch.float32):
+            bs = len(prompt) // 2
+            d = self.t5_tokenizer(prompt,
+                                    return_tensors='pt',
+                                    padding=True).to(self.output_proj.bias.device)
+            d['attention_mask'][bs:, :] = 0  # null condition t5 attn_mask should be zero
+            x = self.t5(input_ids=d['input_ids'],
+                            attention_mask=d['attention_mask']).last_hidden_state  # no kv
+        # Float 16
+        # > self.output_proj() is outside of autocast of t5 - however inside the autocast of lm thus computed in torch.float16
+        x = self.output_proj(x)  # nn.Linear() - produces different result if there is no duplicate txt condition here
+        x[bs:, :, :] = 0  # venv/../site-packages/audiocraft/modules/conditioners.py -> tokenize()
+        return x
+class LMModel(nn.Module):
+    """Autoregressive language model over parallel streams of codebook tokens.
+
+    `generate` encodes the text with `T5` and then calls `forward` once per
+    step; `forward` runs the transformer and also samples the next tokens.
+    """
+    def __init__(self,
+                 n_q = 4,
+                 card = 2048,
+                 dim = 1536
+                 ):
+        super().__init__()
+        self.t5 = T5()
+        self.card = card  # number of real tokens; the value `card` itself is the fill token used by generate()
+        self.n_draw = 1  # draw > 1 tokens of different CFG scale
+                         # batch size > 1 is slower from n_draw as calls transformer on larger batch
+        # card + 1 rows so that the fill token can be embedded too
+        self.emb = nn.ModuleList([nn.Embedding(self.card + 1, dim) for _ in range(n_q)])
+        self.transformer = StreamingTransformer()
+        self.out_norm = nn.LayerNorm(dim, eps=1e-5)
+        # only `card` outputs: the fill token is never predicted
+        self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=False) for _ in range(n_q)])
+    def forward(self,
+                sequence,
+                condition_tensors=None,
+                cache_position=None):
+        """Run one step of the transformer and sample the next token of every stream.
+
+        Reads `self._scale`, which is only assigned inside `generate`.
+
+        Args:
+            sequence: Token ids unpacked as [bs, n_q, time].
+            condition_tensors: Cross-attention source; `generate` passes the
+                output of `self.t5`.
+            cache_position: Position handed to the transformer for the
+                positional embedding.
+
+        Returns:
+            Sampled token ids; the shape comments below give [bs, n_draw, n_q]
+            for the single-frame input used by `generate`.
+        """
+        bs, n_q, time_frames = sequence.shape # [bs, 4, time]
+        # sum the per-stream embeddings
+        input_ = sum([self.emb[k](sequence[:, k]) for k in range(n_q)])
+        out = self.transformer(torch.cat([input_, input_], 0),  # duplicate the batch (bs x 2): second half pairs with the null condition for classifier-free guidance
+                               cross_attention_src=condition_tensors,
+                               cache_position=cache_position
+                               )
+        logits = torch.stack([self.linears[k](self.out_norm(out)) for k in range(n_q)], dim=1) # [2*bs, 4, 1,      2048]
+        # guidance: weighted difference between the first and the second half of the batch
+        logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :]                 # [  bs, 4, n_draw, 2048]
+        k = 24
+        logits = torch.softmax(logits / 1.0, dim=3)  # probabilities, [bs, 4, n_draw, 2048]
+        p, ix = torch.topk(logits, k, dim=3)  # p = [bs, 4, n_draw, 24], ix = [bs, 4, n_draw, 24]
+        # Divide the top-k probabilities by Exponential(1) noise and keep the
+        # largest ratio: this picks entry i with probability p_i / sum(p).
+        deflation = torch.empty_like(p).exponential_(lambd=1)
+        p = p / deflation
+        p = p.argmax(dim=3, keepdim=True)  # winning position inside the top-k, [bs, 4, n_draw, 1]
+        tok = ix.gather(dim=3, index=p).to(torch.int64)  # map back to token ids, [bs, 4, n_draw, 1]
+        return tok[:, :, :, 0].transpose(1, 2)  # [bs, n_draw, 4]
+    @torch.no_grad()
+    def generate(self,
+                 max_tokens=None,
+                 text_condition=None):
+        """Generate `max_tokens` frames of codes for the given text.
+
+        Args:
+            max_tokens: Number of frames to produce; it is used in arithmetic,
+                so the `None` default cannot actually be used.
+            text_condition: Prompts passed to `self.t5`, which treats the second
+                half of the batch as the null condition.
+
+        Returns:
+            Tensor reshaped to [bs, 4, n_draw * max_tokens].
+        """
+        x = self.t5(text_condition)
+        bs = x.shape[0] // 2  # half of the rows are null conditions
+        # random guidance scale in [1.94, 2.24), one value per draw
+        self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
+        cache_position = 0
+        # buffer pre-filled with the fill token `self.card`
+        out_codes = torch.full((bs,
+                                self.n_draw,
+                                4,
+                                4 + 3 + max_tokens),  # 4 leading fill columns + max_tokens + (4 - 1) so that every antidiagonal of height 4 can be indexed
+                               self.card,
+                               dtype=torch.long,
+                               device=x.device) # [bs, n_draw, 4, dur]
+        # autoregressive loop
+        for offset in range(0, max_tokens + 4 - 1):  # max_tokens + n_q - 1
+            # The model input is the antidiagonal starting at column `offset`
+            # (row 0 reads column offset + 3, ..., row 3 reads column offset).
+            next_token = self.forward(out_codes[:, 0, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset][:, :, None],  # index the antidiagonal & expand to [bs, n_q, dur=1]
+                                      condition_tensors=x,
+                                      cache_position=cache_position)  # [bs, n_draw, 4]
+            # The prediction is written onto the next antidiagonal (not a column).
+            # Fill tokens in the leading / trailing triangles must not be
+            # overwritten by predicted tokens, so those entries are reset to 2048.
+            # The 0-th antidiagonal is all fill tokens = [2048, 2048, 2048, 2048].
+            #
+            #   [2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048, 2048],
+            #   [2048, 2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048, 2048],
+            #   [2048, 2048, 2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6, 2048],
+            #   [2048, 2048, 2048, 2048, 2048, 2048, 2048,    0,    1,    2,    3,    4,    5,    6]]
+            if offset == 0:
+                next_token[:, :, 1:4] = 2048  # self.card - bottom 3 entries of the antidiagonal should remain 2048
+            elif offset == 1:
+                next_token[:, :, 2:4] = 2048  # bottom 2 entries of the antidiagonal should remain 2048
+            elif offset == 2:
+                next_token[:, :, 3:4] = 2048  # bottom entry of the antidiagonal should remain 2048
+            elif offset == max_tokens:
+                next_token[:, :, 0:1] = 2048  # top 1 entry of the antidiagonal should stay to 2048
+            elif offset == (max_tokens + 1):
+                next_token[:, :, 0:2] = 2048  # top 2 entries should stay 2048
+            elif offset == (max_tokens + 2):
+                next_token[:, :, 0:3] = 2048  # top 3 entries should stay 2048
+            else:  # offset 3 .. max_tokens-1: all 4 antidiagonal entries are kept
+                pass
+            out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
+            # Every 71 steps truncate the self-attention caches to their first
+            # `n_preserve` entries and restart the position counter from there.
+            if (offset > 0) and (offset % 71) == 0:
+                n_preserve = 4
+                self.transformer._flush(n_preserve=n_preserve)
+                cache_position = n_preserve
+            else:
+                cache_position += 1
+        # drop the 4 leading fill columns and the tail:
+        # [bs, n_draw, 4, time+xtra] -> [bs, 4, n_draw, time] ->  [bs, 4, time * n_draw]
+        out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
+        # clear the caches so that the next call starts from scratch
+        self.transformer._flush()
+        return out_codes

audiocraft/transformer.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from einops import rearrange
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+def create_sin_embedding(positions,
+                         dim,
+                         max_period=10000
+                         ):
+    # assert dim % 2 == 0
+    half_dim = dim // 2
+    positions = positions.to(torch.float)
+    adim = torch.arange(half_dim, device=positions.device,
+                        dtype=torch.float).view(1, 1, -1)
+    max_period_tensor = torch.full([],
+                                   max_period,
+                                   device=positions.device,
+                                   dtype=torch.float)  # avoid sync point
+    phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    # OFFICIAL is torch.float32 HOWEVER self_attn.in_prod_weight = torch.float16
+    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
+class StreamingMultiheadAttention(nn.Module):
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 cross_attention=False,
+                 ):
+        super().__init__()
+        self.cross_attention = cross_attention
+        # if not self.cross_attention then it has kvcachingn
+        self.k_history = None
+        # cleanup history through LM inside GENERATION - Each 0,..,47 mha has different kv history
+        self.v_history = None
+        self.num_heads = num_heads
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.register_buffer('in_proj_weight', torch.ones((3 * embed_dim, embed_dim),
+                                                          dtype=torch.float))
+    def forward(self,
+                query,
+                key=None,
+                value=None):
+        layout = "b h t d"
+        if self.cross_attention:
+            # Different queries, keys, values > split in_proj_weight
+            dim = self.in_proj_weight.shape[0] // 3
+            q = nn.functional.linear(query, self.in_proj_weight[:dim])
+            k = nn.functional.linear(key,   self.in_proj_weight[dim: 2 * dim])
+            v = nn.functional.linear(value, self.in_proj_weight[2 * dim:])
+            q, k, v = [
+                rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
+        else:
+            # 1st projected makes k,v (instantaneous)
+            # Here else is self_attention for audio with itself (above is cross attention txt)
+            # HISTORY - DIFFERENT FOR EACH TRANSF LAYER
+            # here we have different floating values from official
+            projected = nn.functional.linear(query, self.in_proj_weight, None)
+            # print(query.sum(), projected.sum() , self.in_proj_weight.sum(), 'Lc')   # verified official AudioGen values
+            bound_layout = "b h p t d"
+            packed = rearrange(
+                projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
+            q, k, v = packed.unbind(dim=2)
+            if self.k_history is not None:
+                # IF ctrl^c during live_demo the assigning of each of kv is non-atomic k!=v
+                # thus it will try to continue with incompatible k/v dims!
+                self.k_history = torch.cat([self.k_history, k], 2)
+                self.v_history = torch.cat([self.v_history, v], 2)
+            else:
+                self.k_history = k
+                self.v_history = v
+            # Assign Completed k / v to k / v
+            k = self.k_history
+            v = self.v_history
+            # -> kv CACHE ONLY APPLIES if not self.cross_attention
+        x = torch.nn.functional.scaled_dot_product_attention(
+            q, k, v, attn_mask=None, is_causal=False, dropout_p=0.0)
+        x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
+        x = self.out_proj(x)
+        return x
+class StreamingTransformerLayer(nn.Module):
+    def __init__(self,
+                 d_model,
+                 num_heads,
+                 dim_feedforward):
+        super().__init__()
+        self.self_attn = StreamingMultiheadAttention(embed_dim=d_model,
+                                                     num_heads=num_heads)
+        self.linear1 = nn.Linear(d_model, dim_feedforward, bias=False)
+        self.linear2 = nn.Linear(dim_feedforward, d_model, bias=False)
+        self.cross_attention = StreamingMultiheadAttention(embed_dim=d_model,
+                                                           num_heads=num_heads,
+                                                           cross_attention=True)
+        self.norm_cross = nn.LayerNorm(d_model, eps=1e-5)
+        self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
+        self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
+    def forward(self,
+                x,
+                cross_attention_src=None):
+        x = x + self.self_attn(self.norm1(x))
+        x = x + self.cross_attention(query=self.norm_cross(x),
+                                     key=cross_attention_src,
+                                     value=cross_attention_src)  # txtcondition
+        x = x + self.linear2(F.gelu(self.linear1(self.norm2(x))))
+        return x
+class StreamingTransformer(nn.Module):
+    def __init__(self,
+                 d_model=1536,
+                 num_heads=24,
+                 num_layers=48,
+                 dim_feedforward=6144):
+        super().__init__()
+        self.layers = nn.ModuleList(
+                [
+                    StreamingTransformerLayer(d_model=d_model,
+                                              num_heads=num_heads,
+                                              dim_feedforward=dim_feedforward) for _ in range(num_layers)
+                    ]
+                )
+    def forward(self,
+                x,
+                cache_position=None,
+                cross_attention_src=None):
+        x = x + create_sin_embedding(
+                torch.zeros(x.shape[0], 1, 1, device=x.device) + cache_position, 1536)
+        for lay in self.layers:
+            x = lay(x,
+                    cross_attention_src=cross_attention_src)
+        return x
+    def _flush(self,
+               n_preserve=None):
+        for lay in self.layers:
+            if n_preserve is not None:
+                # cache position is difficult to choose to also preserve kv from end
+                lay.self_attn.k_history = lay.self_attn.k_history[:, :, :n_preserve, :]
+                lay.self_attn.v_history = lay.self_attn.v_history[:, :, :n_preserve, :]
+            else:
+                lay.self_attn.k_history = None
+                lay.self_attn.v_history = None

audiocraft/vq.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import math
+import torch
+from torch import nn
+from einops import rearrange
+import torch.nn.functional as F
+class EuclideanCodebook(nn.Module):
+    def __init__(self,
+                 dim,
+                 codebook_size):
+        super().__init__()
+        self.register_buffer("embed", torch.zeros(codebook_size, dim))
+    def decode(self, embed_ind):
+        return F.embedding(embed_ind, self.embed)
+class VectorQuantization(nn.Module):
+    def __init__(
+        self,
+        dim,
+        codebook_size,
+        codebook_dim=None,
+        decay=0.8,
+        epsilon=1e-5,
+        kmeans_init=False,
+        kmeans_iters=10,
+        channels_last=False,
+    ):
+        super().__init__()
+        _codebook_dim = codebook_dim if codebook_dim is not None else dim
+        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size)
+        self.codebook_size = codebook_size
+        self.channels_last = channels_last
+    def _postprocess(self, quantize):
+        if not self.channels_last:
+            # raise ValueError
+            quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize
+    def decode(self, embed_ind):
+        quantize = self._codebook.decode(embed_ind)
+        # quantize = self.project_out(quantize)
+        quantize = self._postprocess(quantize)
+        return quantize
+class ResidualVectorQuantization(nn.Module):
+    """Residual vector quantization implementation.
+    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
+    """
+    def __init__(self, *, num_quantizers, **kwargs):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
+        )
+    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+        quantized_out = torch.tensor(0.0, device=q_indices.device)
+        for i, indices in enumerate(q_indices):
+            layer = self.layers[i]
+            quantized = layer.decode(indices)
+            quantized_out = quantized_out + quantized
+        return quantized_out
+class ResidualVectorQuantizer(nn.Module):
+# dimension=128 n_q=4 q_dropout=False bins=2048 decay=0.99 kmeans_init=True kmeans_iters=50 threshold_ema_dead_code=2
+# orthogonal_reg_weight=0.0 orthogonal_reg_active_codes_only=False orthogonal_reg_max_codes=None
+    def __init__(
+        self,
+        dimension = 128,
+        n_q = 4,
+        q_dropout = False,
+        bins = 2048,
+        decay = 0.99,
+        kmeans_init = True,
+        kmeans_iters = 50,
+        threshold_ema_dead_code = 2,
+        orthogonal_reg_weight = 0.0,
+        orthogonal_reg_active_codes_only = False,
+        orthogonal_reg_max_codes = None,
+    ):
+        super().__init__()
+        self.max_n_q = n_q
+        self.n_q = n_q
+        self.q_dropout = q_dropout
+        self.dimension = dimension
+        self.bins = bins
+        self.decay = decay
+        self.kmeans_init = kmeans_init
+        self.kmeans_iters = kmeans_iters
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.orthogonal_reg_weight = orthogonal_reg_weight
+        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
+        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+        print(f'         {kmeans_init=}\n\n\n\n')
+        self.vq = ResidualVectorQuantization(
+            dim=self.dimension,
+            codebook_size=self.bins,
+            num_quantizers=self.n_q,
+            decay=self.decay,
+            kmeans_init=self.kmeans_init,
+            kmeans_iters=self.kmeans_iters,
+            channels_last=False
+        )
+    def decode(self, codes):
+        """Decode the given codes to the quantized representation."""
+        # codes is [B, K, T], with T frames, K nb of codebooks, vq.decode expects [K, B, T].
+        codes = codes.transpose(0, 1)
+        return self.vq.decode(codes)

vits.py ADDED Viewed

	@@ -0,0 +1,623 @@

+import math
+import numpy as np
+import torch
+from torch import nn
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_utils import PretrainedConfig
+import json
+import os
+import re
+from transformers.tokenization_utils import PreTrainedTokenizer
+import phonemizer
+import torch.nn.functional as F
+# Per-language lists of small integers (1s and 2s), keyed by a language code string.
+# NOTE(review): the code that consumes these patterns is not visible in this part
+# of the file - confirm their meaning against the synthesis code before editing.
+OSCILLATION = {
+        'deu': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'rmc-script_latin': [2, 2, 1, 2, 2],
+        'hun': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'fra': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'eng': [1, 2, 2, 1, 2, 2],
+        'grc': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'ron': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2],
+    }
+def has_non_roman_characters(input_string):
+    # Find any character outside the ASCII range
+    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
+    # Search the input string for non-Roman characters
+    match = non_roman_pattern.search(input_string)
+    has_non_roman = match is not None
+    return has_non_roman
+class VitsConfig(PretrainedConfig):
+    model_type = "vits"
+    def __init__(
+        self,
+        vocab_size=38,
+        hidden_size=192,
+        num_hidden_layers=6,
+        num_attention_heads=2,
+        use_bias=True,
+        ffn_dim=768,
+        ffn_kernel_size=3,
+        flow_size=192,
+        # hidden_act="relu",
+        upsample_initial_channel=512,
+        upsample_rates=[8, 8, 2, 2],
+        upsample_kernel_sizes=[16, 16, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        prior_encoder_num_flows=4,
+        prior_encoder_num_wavenet_layers=4,
+        wavenet_kernel_size=5,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_bias = use_bias
+        self.ffn_dim = ffn_dim
+        self.ffn_kernel_size = ffn_kernel_size
+        self.flow_size = flow_size
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.prior_encoder_num_flows = prior_encoder_num_flows
+        self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
+        self.wavenet_kernel_size = wavenet_kernel_size
+        super().__init__()
+class VitsWaveNet(torch.nn.Module):
+    def __init__(self, config, num_layers):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_layers = num_layers
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        # if hasattr(nn.utils.parametrizations, "weight_norm"):
+        #     # raise ValueError
+        weight_norm = nn.utils.parametrizations.weight_norm
+        # else:
+        #     raise ValueError
+        #     # weight_norm = nn.utils.weight_norm
+        for i in range(num_layers):
+            in_layer = torch.nn.Conv1d(
+                in_channels=config.hidden_size,
+                out_channels=2 * config.hidden_size,
+                kernel_size=config.wavenet_kernel_size,
+                dilation=1,
+                padding=2,
+            )
+            in_layer = weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+            # last one is not necessary
+            if i < num_layers - 1:
+                res_skip_channels = 2 * config.hidden_size
+            else:
+                res_skip_channels = config.hidden_size
+            res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
+            res_skip_layer = weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+    def forward(self,
+                inputs):
+        outputs = torch.zeros_like(inputs)
+        num_channels = torch.IntTensor([self.hidden_size])[0]
+        for i in range(self.num_layers):
+            in_act = self.in_layers[i](inputs)
+            # global_states = torch.zeros_like(hidden_states)  # style ?
+            # acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
+            # --
+            # def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
+            # in_act = input_a #  + input_b
+            t_act = torch.tanh(in_act[:, :num_channels, :])
+            s_act = torch.sigmoid(in_act[:, num_channels:, :])
+            acts = t_act * s_act
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.num_layers - 1:
+                res_acts = res_skip_acts[:, : self.hidden_size, :]
+                inputs = inputs + res_acts
+                outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
+            else:
+                outputs = outputs + res_skip_acts
+        return outputs
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+        super().__init__()
+        self.leaky_relu_slope = leaky_relu_slope
+        self.convs1 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation[i],
+                    padding=self.get_padding(kernel_size, dilation[i]),
+                )
+                for i in range(len(dilation))
+            ]
+        )
+        self.convs2 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=1,
+                    padding=self.get_padding(kernel_size, 1),
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+    def get_padding(self, kernel_size, dilation=1):
+        # 1, 3, 5, 15
+        return (kernel_size * dilation - dilation) // 2
+    def forward(self, hidden_states):
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            residual = hidden_states
+            hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
+            hidden_states = conv1(hidden_states)
+            hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
+            hidden_states = conv2(hidden_states)
+            hidden_states = hidden_states + residual
+        return hidden_states
+class VitsHifiGan(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.conv_pre = nn.Conv1d(
+            config.flow_size,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+        )
+        self.upsampler = nn.ModuleList()
+        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            self.upsampler.append(
+                nn.ConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    kernel_size=kernel_size,
+                    stride=upsample_rate,
+                    padding=(kernel_size - upsample_rate) // 2,
+                )
+            )
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.upsampler)):
+            channels = config.upsample_initial_channel // (2 ** (i + 1))
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation))
+        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
+    def forward(self,
+                spectrogram):
+        hidden_states = self.conv_pre(spectrogram)
+        for i in range(self.num_upsamples):
+            hidden_states = F.leaky_relu(hidden_states, negative_slope=.1, inplace=True)
+            hidden_states = self.upsampler[i](hidden_states)
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels
+        hidden_states = F.leaky_relu(hidden_states, negative_slope=.01, inplace=True)
+        hidden_states = self.conv_post(hidden_states)
+        waveform = torch.tanh(hidden_states)
+        return waveform
+class VitsResidualCouplingLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.half_channels = config.flow_size // 2
+        self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
+        self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
+        self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
+    def forward(self,
+                x,
+                reverse=False):
+        first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
+        hidden_states = self.conv_pre(first_half)
+        hidden_states = self.wavenet(hidden_states)
+        mean = self.conv_post(hidden_states)
+        second_half = (second_half - mean)
+        outputs = torch.cat([first_half, second_half], dim=1)
+        return outputs
+class VitsResidualCouplingBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(config.prior_encoder_num_flows):
+            self.flows.append(VitsResidualCouplingLayer(config))
+    def forward(self, x, reverse=False):
+        # x L [1, 192, 481]
+        for flow in reversed(self.flows):
+            x = torch.flip(x, [1])  # flipud CHANNELs
+            x = flow(x, reverse=True)
+        return x
+class VitsAttention(nn.Module):
+    """has no positional info"""
+    def __init__(self, config):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scaling = self.head_dim**-0.5
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+    def _shape(self, tensor, seq_len, bsz):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states,
+        layer_head_mask = None,
+        output_attentions = False,
+    ):
+        bsz, tgt_len, _ = hidden_states.size()
+        # Q
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # K/V
+        hidden_states = hidden_states[:, :40, :]  # drop time-frames from k/v [bs*2, time, 96=ch]
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_output = torch.bmm(attn_weights,
+                                value_states)
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned aross GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output
+class VitsFeedForward(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size, padding=1)
+        self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size, padding=1)
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = F.relu(self.conv_1(hidden_states))  # inplace changes sound ;
+        hidden_states = self.conv_2(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
+        return hidden_states
+class VitsEncoderLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attention = VitsAttention(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
+        self.feed_forward = VitsFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
+    def forward(
+        self,
+        hidden_states,
+        output_attentions = False,
+    ):
+        residual = hidden_states
+        hidden_states = self.attention(
+            hidden_states=hidden_states,
+            # attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.layer_norm(residual + hidden_states)
+        residual = hidden_states
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = self.final_layer_norm(residual + hidden_states)
+        outputs = (hidden_states,)
+        return outputs
+class VitsEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+    def forward(
+        self,
+        hidden_states):
+        for _layer in self.layers:
+            layer_outputs = _layer(hidden_states)
+            hidden_states = layer_outputs[0]
+        return hidden_states
+class VitsTextEncoder(nn.Module):
+    """
+    Has VitsEncoder
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.encoder = VitsEncoder(config)  # 6 Layers of VitsAttention
+        self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
+    def forward(self,
+                input_ids
+                ):
+        hidden_states = self.embed_tokens(input_ids) * 4      #Actually4-or-4.856406460551018-@-845-len-ids-deu
+        stats = self.project(self.encoder(hidden_states=hidden_states).transpose(1, 2)).transpose(1, 2)
+        return stats[:, :, :self.config.flow_size]  # prior_means
+class VitsPreTrainedModel(PreTrainedModel):
+    """Base class that ties `VitsConfig` and the VITS naming settings to `PreTrainedModel`."""
+    # Configuration class associated with this model family.
+    config_class = VitsConfig
+    base_model_prefix = "vits"
+    # Name of the primary forward() argument.
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = True
+class VitsModel(VitsPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.text_encoder = VitsTextEncoder(config)  # has VitsEncoder that includes 6L of VitsAttention
+        self.flow = VitsResidualCouplingBlock(config)
+        self.decoder = VitsHifiGan(config)
+    def forward(
+        self,
+        input_ids = None,
+        attention_mask = None,
+        speaker_id = None,
+        output_attentions = None,
+        output_hidden_states = None,
+        return_dict = None,
+        labels = None,
+        speed = None,
+        lang_code = 'deu',  # speed oscillation pattern per voice/lang
+    ):
+        mask_dtype = self.text_encoder.embed_tokens.weight.dtype
+        if attention_mask is not None:
+            input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
+        else:
+            raise ValueError
+            input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
+        prior_means = self.text_encoder(input_ids=input_ids)
+        input_padding_mask = input_padding_mask.transpose(1, 2)
+        bs, in_len, _ = prior_means.shape
+        # VITS Duration Oscillation
+        pattern = OSCILLATION.get(lang_code, [1, 2, 1])
+        duration = torch.tensor(pattern,
+                                device=prior_means.device).repeat(int(in_len / len(pattern)) + 2)[None, None, :in_len]   # perhaps define [1, 2, 1] per voice or language
+        duration[:, :, 0] = 4
+        duration[:, :, -1] = 3
+        # ATTN
+        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
+        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
+        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
+        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
+        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
+        batch_size, _, output_length, input_length = attn_mask.shape
+        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
+        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
+        valid_indices = indices.unsqueeze(0) < cum_duration
+        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
+        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
+        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+        attn = attn[:, 0, :, :]
+        attn = attn + 1e-4 * torch.rand_like(attn)
+        attn /= attn.sum(2, keepdims=True)
+        #print(attn)
+        prior_means = torch.matmul(attn, prior_means)  # try attn to contain .5/.5 instead of 1/0 so it smoothly interpolates repeated prior_means
+        #prior_means = F.interpolate(prior_means.transpose(1,2),   int(1.74 * prior_means.shape[1]), mode='linear').transpose(1,2)  # extend for slow speed
+        # prior means have now been replicated x duration of each prior mean
+        latents = self.flow(prior_means.transpose(1, 2), # + torch.randn_like(prior_means) * .94,
+                            reverse=True)
+        waveform = self.decoder(latents)  # [bs, 1, 16000]
+        return waveform[:, 0, :]
+class VitsTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "vocab.json"}
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        unk_token="<unk>",
+        language=None,
+        add_blank=True,
+        normalize=True,
+        phonemize=True,
+        is_uroman=False,
+        **kwargs,
+    ):
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.language = language
+        self.add_blank = add_blank
+        self.normalize = normalize
+        self.phonemize = phonemize
+        self.is_uroman = is_uroman
+        super().__init__(
+            pad_token=pad_token,
+            unk_token=unk_token,
+            language=language,
+            add_blank=add_blank,
+            normalize=normalize,
+            phonemize=phonemize,
+            is_uroman=is_uroman,
+            **kwargs,
+        )
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def normalize_text(self, input_string):
+        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
+        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
+        filtered_text = ""
+        i = 0
+        while i < len(input_string):
+            found_match = False
+            for word in all_vocabulary:
+                if input_string[i : i + len(word)] == word:
+                    filtered_text += word
+                    i += len(word)
+                    found_match = True
+                    break
+            if not found_match:
+                filtered_text += input_string[i].lower()
+                i += 1
+        return filtered_text
+    def _preprocess_char(self, text):
+        """Special treatment of characters in certain languages"""
+        if self.language == "ron":
+            text = text.replace("ț", "ţ")
+        return text
+    def prepare_for_tokenization(
+        self, text: str, is_split_into_words: bool = False, normalize = None, **kwargs):
+        normalize = normalize if normalize is not None else self.normalize
+        if normalize:
+            # normalise for casing
+            text = self.normalize_text(text)
+        filtered_text = self._preprocess_char(text)
+        if has_non_roman_characters(filtered_text) and self.is_uroman:
+            # 7 langs -  For now replace all to romans in app.py
+            raise ValueError
+        if self.phonemize:
+            if not is_phonemizer_available():
+                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")
+            filtered_text = phonemizer.phonemize(
+                filtered_text,
+                language="en-us",
+                backend="espeak",
+                strip=True,
+                preserve_punctuation=True,
+                with_stress=True,
+            )
+            filtered_text = re.sub(r"\s+", " ", filtered_text)
+        elif normalize:
+            # strip any chars outside of the vocab (punctuation)
+            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()
+        return filtered_text, kwargs
+    def _tokenize(self, text):
+        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
+        tokens = list(text)
+        if self.add_blank:
+            # sounds dyslexi if no space between letters
+            # sounds disconnected if >2 spaces between letters
+            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2) # + 1)  # +1 rises slice index error if tokens odd
+            interspersed[::2] = tokens
+            tokens = interspersed + [self._convert_id_to_token(0)]  # append one last space (it has indexing error ::2 mismatch if tokens is odd)
+        return tokens
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)