zirobtc committed · verified
Commit 4ffb3b6 · 1 Parent(s): c0a8ea3

Uploading DART folder into model repo

finetune/config.yaml ADDED
@@ -0,0 +1,17 @@
+ # CHANGE THIS TO YOUR OWN DATASET
+ TTS_dataset: zirobtc/biboo-dataset-tokenised
+
+ model_name: "canopylabs/orpheus-tts-0.1-finetune-prod"
+
+ # Training Args
+ epochs: 3
+ batch_size: 1
+ number_processes: 1
+ pad_token: 128263
+ save_steps: 5000
+ learning_rate: 5.0e-5
+
+ # Naming and paths
+ save_folder: "miko-tts-biboo"
+ project_name: "miko-biboo"
+ run_name: "5e5-0"
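Both training scripts below read every one of these keys at startup, so a typo'd or missing key only surfaces as a `KeyError` partway through launch. A minimal pre-flight check, sketched here against the key list above (not part of the commit):

```python
# Hypothetical pre-flight check for config.yaml (assumes the keys listed above).
import yaml

REQUIRED_KEYS = [
    "TTS_dataset", "model_name", "epochs", "batch_size", "number_processes",
    "pad_token", "save_steps", "learning_rate",
    "save_folder", "project_name", "run_name",
]

with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

missing = [key for key in REQUIRED_KEYS if key not in config]
if missing:
    raise KeyError(f"config.yaml is missing keys: {missing}")
print("config OK:", config["TTS_dataset"], "->", config["save_folder"])
```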
finetune/lora.py ADDED
@@ -0,0 +1,74 @@
+ from datasets import load_dataset
+ from peft import LoraConfig, get_peft_model
+ from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
+ import numpy as np
+ import yaml
+ import wandb
+
+ config_file = "config.yaml"
+
+ with open(config_file, "r") as file:
+     config = yaml.safe_load(file)
+
+ dsn = config["TTS_dataset"]
+
+ model_name = config["model_name"]
+ run_name = config["run_name"]
+ project_name = config["project_name"]
+ base_repo_id = config["save_folder"]
+ epochs = config["epochs"]
+ batch_size = config["batch_size"]
+ save_steps = config["save_steps"]
+ pad_token = config["pad_token"]
+ number_processes = config["number_processes"]
+ learning_rate = config["learning_rate"]
+
+ lora_rank = 32
+ lora_alpha = 64
+ lora_dropout = 0.0
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2")
+
+ lora_config = LoraConfig(
+     r=lora_rank,
+     lora_alpha=lora_alpha,
+     lora_dropout=lora_dropout,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
+     bias="none",
+     modules_to_save=["lm_head", "embed_tokens"],  # optionally also train the embeddings and LM head
+     task_type="CAUSAL_LM",
+     use_rslora=True,
+ )
+
+ model = get_peft_model(model, lora_config)
+
+ ds = load_dataset(dsn, split="train")
+
+ wandb.init(project=project_name, name=run_name)
+
+ training_args = TrainingArguments(
+     overwrite_output_dir=True,
+     num_train_epochs=epochs,
+     per_device_train_batch_size=batch_size,
+     logging_steps=1,
+     bf16=True,
+     output_dir=f"./{base_repo_id}",
+     report_to="wandb",
+     save_steps=save_steps,
+     remove_unused_columns=True,
+     learning_rate=learning_rate,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=ds,
+ )
+
+ trainer.train()
+
+ merged_model = model.merge_and_unload()
+
+ merged_model.save_pretrained(f"./{base_repo_id}/merged")
+ tokenizer.save_pretrained(f"./{base_repo_id}/merged")
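After training, the script merges the LoRA weights back into the base model and writes the result to `./miko-tts-biboo/merged`. A quick way to confirm that directory reloads cleanly, sketched under the assumption that training completed with the config above (the model emits audio tokens, so the decoded output is only a load check, not intelligible text):

```python
# Hedged smoke test: reload the merged checkpoint written by lora.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

merged_dir = "./miko-tts-biboo/merged"  # save_folder from config.yaml + "/merged"
tokenizer = AutoTokenizer.from_pretrained(merged_dir)
model = AutoModelForCausalLM.from_pretrained(merged_dir, torch_dtype=torch.bfloat16)

inputs = tokenizer("test prompt", return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output[0]))
```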
finetune/requirments.txt ADDED
@@ -0,0 +1,6 @@
+ flash_attn
+ transformers==4.51.3
+ trl==0.8.0
+ datasets
+ wandb
+ accelerate
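Note that `peft` (imported by `lora.py`) and `vllm` / `orpheus_tts` (imported by `tts_engines.py`) are not listed here, so they presumably come from the base environment. A small sanity check that the pinned versions resolved, as a sketch:

```python
# Environment sanity check (sketch): confirm the pinned versions installed.
import transformers
import trl

assert transformers.__version__ == "4.51.3", transformers.__version__
assert trl.__version__ == "0.8.0", trl.__version__

# Imported by the scripts in this folder but not pinned above:
import peft         # used by lora.py
import vllm         # used by tts_engines.py
import orpheus_tts  # used by tts_engines.py
print("environment OK")
```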
finetune/setup.py ADDED
@@ -0,0 +1,35 @@
+ # prepare_local_checkpoint.py
+
+ import os
+ import shutil
+ from huggingface_hub import snapshot_download
+
+ # The repository where the base Orpheus model's tokenizer files are located
+ PRETRAINED_REPO = "zirobtc/miko-voice-tts-3e-5-7"
+ # The local directory of your fine-tuned model checkpoint.
+ # Ensure this path exists and contains your fine-tuned model weights (e.g., model.safetensors).
+ CHECKPOINT_DIR = "./miko-tts-biboo/checkpoint-3021"
+
+
+
+ # --- 1. Download tokenizer files from the pretrained model ---
+ print(f"Downloading tokenizer files from {PRETRAINED_REPO}...")
+ tokenizer_files = [
+     "tokenizer.json",
+     "tokenizer_config.json",
+     "special_tokens_map.json"
+ ]
+ # snapshot_download fetches these specific files into the local cache directory
+ pretrained_tokenizer_local_cache_path = snapshot_download(PRETRAINED_REPO, allow_patterns=tokenizer_files)
+ print(f"✅ Tokenizer files downloaded to cache: {pretrained_tokenizer_local_cache_path}")
+
+
+ # --- 2. Copy tokenizer files into your local checkpoint directory ---
+ print(f"Copying tokenizer files to local checkpoint directory: {CHECKPOINT_DIR}...")
+ for filename in tokenizer_files:
+     source_path = os.path.join(pretrained_tokenizer_local_cache_path, filename)
+     destination_path = os.path.join(CHECKPOINT_DIR, filename)
+     shutil.copy(source_path, destination_path)
+     print(f"Copied {filename} to {destination_path}")
+
+ print("🎉 Tokenizer files successfully placed in your local fine-tuned model checkpoint directory.")
finetune/tester.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import sys
+ import uuid
+ import tts_engines as tts_engine  # the engine module in this folder is tts_engines.py
+
+ def main():
+     print("🚀 Orpheus TTS CLI Tester 🚀")
+     print("Type your text and press Enter to synthesize speech.")
+     print("Press Ctrl+D (EOF) or type 'exit' to quit.")
+
+     # Ensure the output directory exists
+     output_dir = "generated_audio"
+     os.makedirs(output_dir, exist_ok=True)
+     print(f"Audio files will be saved to: {os.path.abspath(output_dir)}")
+
+     try:
+         # Load the TTS model once at the start
+         tts_engine.setup_model()
+     except Exception as e:
+         print(f"❌ Error setting up TTS model: {e}")
+         print("Please ensure 'vllm' is installed and your 'config.yaml' is correctly configured.")
+         sys.exit(1)
+
+     while True:
+         try:
+             prompt = input("\nEnter text to synthesize: ").strip()
+
+             if prompt.lower() == 'exit':
+                 break
+             if not prompt:
+                 print("Please enter some text.")
+                 continue
+
+             print(f"Synthesizing: '{prompt}'...")
+             try:
+                 audio_bytes, _ = tts_engine.synthesize_for_scene(prompt=prompt)
+
+                 # Generate a unique filename for the WAV output
+                 output_filename = os.path.join(output_dir, f"output_{uuid.uuid4().hex}.wav")
+                 with open(output_filename, "wb") as f:
+                     f.write(audio_bytes)
+                 print(f"✅ Audio saved to: {output_filename}")
+
+             except Exception as e:
+                 print(f"❌ Error during synthesis: {e}")
+                 print("Make sure your model is correctly loaded and accessible.")
+
+         except EOFError:
+             print("\nExiting tester.")
+             break
+         except KeyboardInterrupt:
+             print("\nExiting tester.")
+             break
+
+ if __name__ == "__main__":
+     main()
finetune/tester.yaml ADDED
@@ -0,0 +1,5 @@
+ voice: "miko"
+ temperature: 0.6
+ top_p: 0.9
+ max_tokens: 2048
+ repetition_penalty: 1.3
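These values are read back by `load_yaml()` in `tts_engines.py` and passed through `generate_speech()`, where the sampling-related ones end up as vLLM sampling parameters (the `voice` key is used for prompt formatting instead). Roughly, as a sketch:

```python
# Sketch of how tester.yaml maps onto vLLM sampling (mirrors tts_engines.py).
import yaml
from vllm import SamplingParams

with open("tester.yaml", "r") as f:
    cfg = yaml.safe_load(f)

params = SamplingParams(
    temperature=cfg["temperature"],                # 0.6
    top_p=cfg["top_p"],                            # 0.9
    max_tokens=cfg["max_tokens"],                  # 2048
    repetition_penalty=cfg["repetition_penalty"],  # 1.3
)
print(params)
```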
finetune/train.py ADDED
@@ -0,0 +1,53 @@
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
+ import numpy as np
+ import yaml
+ import wandb
+
+ config_file = "config.yaml"
+
+ with open(config_file, "r") as file:
+     config = yaml.safe_load(file)
+
+ dsn = config["TTS_dataset"]
+
+ model_name = config["model_name"]
+ run_name = config["run_name"]
+ project_name = config["project_name"]
+ base_repo_id = config["save_folder"]
+ epochs = config["epochs"]
+ batch_size = config["batch_size"]
+ save_steps = config["save_steps"]
+ pad_token = config["pad_token"]
+ number_processes = config["number_processes"]
+ learning_rate = config["learning_rate"]
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2")
+
+
+ ds = load_dataset(dsn, split="train")
+
+ wandb.init(project=project_name, name=run_name)
+
+ training_args = TrainingArguments(
+     overwrite_output_dir=True,
+     num_train_epochs=epochs,
+     per_device_train_batch_size=batch_size,
+     logging_steps=1,
+     bf16=True,
+     output_dir=f"./{base_repo_id}",
+     report_to="wandb",
+     save_steps=save_steps,
+     remove_unused_columns=True,
+     learning_rate=learning_rate,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=ds,
+ )
+
+ trainer.train()
+
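Unlike `lora.py`, this full fine-tune script ends at `trainer.train()`, so its only outputs are the periodic checkpoints written every `save_steps` steps. If a final consolidated model is wanted, something like the following could be appended (a sketch, not part of the commit):

```python
# Possible tail for train.py: persist a final model + tokenizer alongside
# the step checkpoints. `trainer`, `tokenizer`, and `base_repo_id` are the
# variables already defined in the script above.
trainer.save_model(f"./{base_repo_id}/final")
tokenizer.save_pretrained(f"./{base_repo_id}/final")
```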
finetune/tts_engines.py ADDED
@@ -0,0 +1,146 @@
+ # tts_engines.py
+
+ import wave
+ import asyncio
+ import uuid  # used to generate unique request IDs
+ import threading
+ import queue
+ import base64
+ from io import BytesIO
+ import yaml
+
+ from orpheus_tts.engine_class import OrpheusModel
+ from vllm.outputs import RequestOutput
+ from vllm import SamplingParams
+
+ # --- Background loop to keep vLLM stable across requests ---
+ # Bridges vLLM's async generator API to a synchronous iterator.
+ class BackgroundEventLoop:
+     def __init__(self):
+         self._loop = asyncio.new_event_loop()
+         self._thread = threading.Thread(target=self._run_loop, daemon=True)
+         self._thread.start()
+
+     def _run_loop(self):
+         asyncio.set_event_loop(self._loop)
+         self._loop.run_forever()
+
+     def run_generator(self, async_gen):
+         q = queue.Queue()
+         sentinel = object()
+
+         async def producer():
+             try:
+                 async for item in async_gen:
+                     q.put(item)
+             except Exception as e:
+                 q.put(e)
+             finally:
+                 q.put(sentinel)
+
+         asyncio.run_coroutine_threadsafe(producer(), self._loop)
+
+         while True:
+             item = q.get()
+             if item is sentinel:
+                 break
+             if isinstance(item, Exception):
+                 raise item
+             yield item
+
+ # --- Patched Orpheus model using the background loop ---
+ tts_event_loop = BackgroundEventLoop()
+
+ class PatchedOrpheusModel(OrpheusModel):
+     # Overridden so that every call gets its own unique request_id.
+     def generate_tokens_sync(self, prompt, voice=None, request_id=None, temperature=0.6, top_p=0.8, max_tokens=1200, stop_token_ids=[49158], repetition_penalty=1.3):
+
+         # If no request_id is provided, generate a new unique one.
+         # This avoids the "id already running" error on repeated calls.
+         if request_id is None:
+             request_id = str(uuid.uuid4())
+
+         prompt_string = self._format_prompt(prompt, voice)
+         sampling_params = SamplingParams(
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             stop_token_ids=stop_token_ids,
+             repetition_penalty=repetition_penalty,
+         )
+         async_gen = self.engine.generate(
+             prompt=prompt_string,
+             sampling_params=sampling_params,
+             request_id=request_id  # use the unique ID
+         )
+         for result in tts_event_loop.run_generator(async_gen):
+             if not isinstance(result, RequestOutput):
+                 raise TypeError(f"Unexpected result type: {type(result)}")
+             yield result.outputs[0].text
+
+ # --- Persistent global model ---
+ # Loaded once by setup_model() and reused across calls.
+ model = None
+
+
+ def load_yaml():
+     """
+     Load tester.yaml and return the parsed config dict (or None on failure).
+     """
+     file_path = "tester.yaml"
+     try:
+         with open(file_path, 'r') as file:
+             config = yaml.safe_load(file)
+         return config
+
+     except FileNotFoundError:
+         print(f"Error: The file '{file_path}' was not found.")
+         return None
+     except yaml.YAMLError as e:
+         print(f"Error parsing YAML file: {e}")
+         return None
+
+
+
+ def setup_model():
+     global model
+     if model is None:
+         print("Loading TTS model...")
+         model = PatchedOrpheusModel(model_name="./miko-tts-biboo/checkpoint-3021")
+         print("✅ Model loaded and ready.")
+
+ def synthesize_for_scene(
+     prompt: str,
+     voice: str = "miko",
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     repetition_penalty: float = 1.3,
+     max_tokens: int = 1200,
+ ):
+     global model
+
+     config = load_yaml()
+
+     # Safe to call in parallel: each call triggers a unique request_id in
+     # PatchedOrpheusModel. Note: tester.yaml values override the keyword args.
+     chunks = bytearray()
+     for chunk in model.generate_speech(
+         prompt=prompt,
+         voice=config["voice"],
+         temperature=config["temperature"],
+         top_p=config["top_p"],
+         max_tokens=config["max_tokens"],
+         repetition_penalty=config["repetition_penalty"],
+     ):
+         chunks.extend(chunk)
+
+     buffer = BytesIO()
+     with wave.open(buffer, "wb") as wf:
+         wf.setnchannels(1)
+         wf.setsampwidth(2)
+         wf.setframerate(24000)
+         wf.writeframes(chunks)
+
+     audio_bytes = buffer.getvalue()
+     audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+     return audio_bytes, audio_base64
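Taken together with `tester.yaml`, the module is used roughly like this (a sketch assuming a GPU host with `vllm` and `orpheus_tts` installed, and the checkpoint path hard-coded in `setup_model()` present on disk):

```python
# Minimal usage sketch for tts_engines.py.
import tts_engines

tts_engines.setup_model()  # loads PatchedOrpheusModel once into the module global
audio_bytes, audio_b64 = tts_engines.synthesize_for_scene(prompt="Hello there!")

with open("hello.wav", "wb") as f:
    f.write(audio_bytes)  # 24 kHz, 16-bit mono WAV, per the wave settings above
```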