Uploading DART folder into model repo
- finetune/config.yaml +17 -0
- finetune/lora.py +74 -0
- finetune/requirments.txt +6 -0
- finetune/setup.py +35 -0
- finetune/tester.py +56 -0
- finetune/tester.yaml +5 -0
- finetune/train.py +53 -0
- finetune/tts_engines.py +146 -0
finetune/config.yaml
ADDED
@@ -0,0 +1,17 @@
# CHANGE THIS TO YOUR OWN DATASET
TTS_dataset: zirobtc/biboo-dataset-tokenised

model_name: "canopylabs/orpheus-tts-0.1-finetune-prod"

# Training Args
epochs: 3
batch_size: 1
number_processes: 1
pad_token: 128263
save_steps: 5000
learning_rate: 5.0e-5

# Naming and paths
save_folder: "miko-tts-biboo"
project_name: "miko-biboo"
run_name: "5e5-0"
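A quick sanity check before launching a run is to parse the file with PyYAML and confirm the numeric fields come back typed. Note that PyYAML only recognizes scientific notation as a float when it has a decimal point and a signed exponent, which is why the learning rate is written `5.0e-5` rather than `5e-5`. A minimal sketch (the `finetune/` path is an assumption about where you run it from):

```python
# Sketch: verify config.yaml parses with the expected types.
import yaml

with open("finetune/config.yaml") as f:
    cfg = yaml.safe_load(f)

assert isinstance(cfg["learning_rate"], float)  # 5.0e-5 parses as a float
assert isinstance(cfg["epochs"], int)
print(cfg["TTS_dataset"], cfg["learning_rate"])
```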
finetune/lora.py
ADDED
@@ -0,0 +1,74 @@
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import yaml
import wandb

config_file = "config.yaml"

with open(config_file, "r") as file:
    config = yaml.safe_load(file)

dsn = config["TTS_dataset"]

model_name = config["model_name"]
run_name = config["run_name"]
project_name = config["project_name"]
base_repo_id = config["save_folder"]
epochs = config["epochs"]
batch_size = config["batch_size"]
save_steps = config["save_steps"]
pad_token = config["pad_token"]
number_processes = config["number_processes"]
learning_rate = config["learning_rate"]

# LoRA hyperparameters
lora_rank = 32
lora_alpha = 64
lora_dropout = 0.0

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2")

lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    bias="none",
    modules_to_save=["lm_head", "embed_tokens"],  # Optional: also train the embeddings and LM head
    task_type="CAUSAL_LM",
    use_rslora=True,
)

model = get_peft_model(model, lora_config)

ds = load_dataset(dsn, split="train")

wandb.init(project=project_name, name=run_name)

training_args = TrainingArguments(
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    logging_steps=1,
    bf16=True,
    output_dir=f"./{base_repo_id}",
    report_to="wandb",
    save_steps=save_steps,
    remove_unused_columns=True,
    learning_rate=learning_rate,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
)

trainer.train()

# Merge the LoRA adapters back into the base weights and save a standalone model
merged_model = model.merge_and_unload()

merged_model.save_pretrained(f"./{base_repo_id}/merged")
tokenizer.save_pretrained(f"./{base_repo_id}/merged")
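After `get_peft_model`, it is worth checking how much of the model is actually trainable, since `modules_to_save=["lm_head", "embed_tokens"]` trains full copies of the embeddings and LM head on top of the adapters. A sketch using PEFT's built-in helper:

```python
# Sketch: inspect what the LoRA wrapping made trainable. With use_rslora=True
# the adapter scaling is lora_alpha / sqrt(r) (here 64 / sqrt(32)) rather than
# the classic lora_alpha / r.
model.print_trainable_parameters()
# prints e.g.: trainable params: ... || all params: ... || trainable%: ...
```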
finetune/requirments.txt
ADDED
@@ -0,0 +1,6 @@
flash_attn
transformers==4.51.3
trl==0.8.0
datasets
wandb
accelerate
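`flash_attn` generally has to compile against a torch/CUDA toolchain that is already installed, so it often needs to be installed separately from the rest of the pins. A quick import check before training (a sketch; the version attributes are standard, but your installed versions are environment-specific):

```python
# Sketch: confirm the pinned stack is importable before training.
import flash_attn
import transformers

assert transformers.__version__ == "4.51.3", transformers.__version__
print("flash-attn:", flash_attn.__version__)
```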
finetune/setup.py
ADDED
@@ -0,0 +1,35 @@
# prepare_local_checkpoint.py

import os
import shutil
from huggingface_hub import snapshot_download, login

# The repository where the base Orpheus model's tokenizer files are located
PRETRAINED_REPO = "zirobtc/miko-voice-tts-3e-5-7"
# The local directory of your fine-tuned model checkpoint.
# Ensure this path exists and contains your fine-tuned model weights (e.g., model.safetensors)
CHECKPOINT_DIR = "./miko-tts-biboo/checkpoint-3021"


# --- 1. Download tokenizer files from the pretrained model ---
print(f"Downloading tokenizer files from {PRETRAINED_REPO}...")
tokenizer_files = [
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json"
]
# snapshot_download will download these specific files into a local cache directory
pretrained_tokenizer_local_cache_path = snapshot_download(PRETRAINED_REPO, allow_patterns=tokenizer_files)
print(f"✅ Tokenizer files downloaded to cache: {pretrained_tokenizer_local_cache_path}")


# --- 2. Copy tokenizer files into your local checkpoint directory ---
print(f"Copying tokenizer files to local checkpoint directory: {CHECKPOINT_DIR}...")
for filename in tokenizer_files:
    source_path = os.path.join(pretrained_tokenizer_local_cache_path, filename)
    destination_path = os.path.join(CHECKPOINT_DIR, filename)
    shutil.copy(source_path, destination_path)
    print(f"Copied {filename} to {destination_path}")

print("🎉 Tokenizer files successfully placed in your local fine-tuned model checkpoint directory.")
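With the tokenizer files in place, the checkpoint directory should resolve as a self-contained local model. A minimal verification sketch (assuming the checkpoint path above exists on disk):

```python
# Sketch: the checkpoint dir should now load without reaching for the Hub.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./miko-tts-biboo/checkpoint-3021")
print(type(tok).__name__, "vocab size:", len(tok))
```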
finetune/tester.py
ADDED
@@ -0,0 +1,56 @@
import os
import sys
import uuid
import tts_engines as tts_engine  # The engine module ships as tts_engines.py in this folder

def main():
    print("🚀 Orpheus TTS CLI Tester 🚀")
    print("Type your text and press Enter to synthesize speech.")
    print("Press Ctrl+D (EOF) or type 'exit' to quit.")

    # Ensure the output directory exists
    output_dir = "generated_audio"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Audio files will be saved to: {os.path.abspath(output_dir)}")

    try:
        # Load the TTS model once at the start
        tts_engine.setup_model()
    except Exception as e:
        print(f"❌ Error setting up TTS model: {e}")
        print("Please ensure 'vllm' is installed and your 'config.yaml' is correctly configured.")
        sys.exit(1)

    while True:
        try:
            prompt = input("\nEnter text to synthesize: ").strip()

            if prompt.lower() == 'exit':
                break
            if not prompt:
                print("Please enter some text.")
                continue

            print(f"Synthesizing: '{prompt}'...")
            try:
                audio_bytes, _ = tts_engine.synthesize_for_scene(prompt=prompt)

                # Generate a unique filename for the WAV output
                output_filename = os.path.join(output_dir, f"output_{uuid.uuid4().hex}.wav")
                with open(output_filename, "wb") as f:
                    f.write(audio_bytes)
                print(f"✅ Audio saved to: {output_filename}")

            except Exception as e:
                print(f"❌ Error during synthesis: {e}")
                print("Make sure your model is correctly loaded and accessible.")

        except EOFError:
            print("\nExiting tester.")
            break
        except KeyboardInterrupt:
            print("\nExiting tester.")
            break

if __name__ == "__main__":
    main()
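The same functions can also be driven non-interactively; a minimal one-shot sketch (using the same `tts_engines` alias as the import above):

```python
# Sketch: one-shot synthesis without the interactive loop.
import tts_engines as tts_engine

tts_engine.setup_model()
audio_bytes, _ = tts_engine.synthesize_for_scene(prompt="Hello there!")
with open("hello.wav", "wb") as f:
    f.write(audio_bytes)
```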
finetune/tester.yaml
ADDED
@@ -0,0 +1,5 @@
voice: "miko"
temperature: 0.6
top_p: 0.9
max_tokens: 2048
repetition_penalty: 1.3
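These are the sampling values that `synthesize_for_scene` in tts_engines.py actually uses at generation time; its own keyword defaults (e.g. `max_tokens=1200`) are overridden by this file. A quick round-trip check:

```python
# Sketch: tester.yaml values take precedence over synthesize_for_scene defaults.
import yaml

with open("finetune/tester.yaml") as f:
    params = yaml.safe_load(f)

print(params)  # {'voice': 'miko', 'temperature': 0.6, 'top_p': 0.9, ...}
```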
finetune/train.py
ADDED
@@ -0,0 +1,53 @@
# Full fine-tune variant: same pipeline as lora.py, without the PEFT/LoRA wrapping
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import yaml
import wandb

config_file = "config.yaml"

with open(config_file, "r") as file:
    config = yaml.safe_load(file)

dsn = config["TTS_dataset"]

model_name = config["model_name"]
run_name = config["run_name"]
project_name = config["project_name"]
base_repo_id = config["save_folder"]
epochs = config["epochs"]
batch_size = config["batch_size"]
save_steps = config["save_steps"]
pad_token = config["pad_token"]
number_processes = config["number_processes"]
learning_rate = config["learning_rate"]

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2")


ds = load_dataset(dsn, split="train")

wandb.init(project=project_name, name=run_name)

training_args = TrainingArguments(
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    logging_steps=1,
    bf16=True,
    output_dir=f"./{base_repo_id}",
    report_to="wandb",
    save_steps=save_steps,
    remove_unused_columns=True,
    learning_rate=learning_rate,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
)

trainer.train()
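train.py ends at `trainer.train()`, so the only saved weights are the periodic `save_steps` checkpoints under the output directory. If an explicit final save is wanted, a hedged two-liner to append:

```python
# Sketch: explicitly persist a final checkpoint (train.py otherwise relies on
# the periodic save_steps checkpoints under the save_folder directory).
trainer.save_model(f"./{base_repo_id}/final")
tokenizer.save_pretrained(f"./{base_repo_id}/final")
```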
finetune/tts_engines.py
ADDED
@@ -0,0 +1,146 @@
# tts_engines.py

import wave
import asyncio
import uuid  # Used to generate unique request IDs
import threading
import queue
import base64
from io import BytesIO
import yaml

from orpheus_tts.engine_class import OrpheusModel
from vllm.outputs import RequestOutput
from vllm import SamplingParams

# --- Background loop to keep vLLM stable across requests ---
class BackgroundEventLoop:
    def __init__(self):
        self._loop = asyncio.new_event_loop()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()

    def _run_loop(self):
        asyncio.set_event_loop(self._loop)
        self._loop.run_forever()

    def run_generator(self, async_gen):
        # Drive an async generator on the background loop and yield its items
        # synchronously through a thread-safe queue.
        q = queue.Queue()
        sentinel = object()

        async def producer():
            try:
                async for item in async_gen:
                    q.put(item)
            except Exception as e:
                q.put(e)
            finally:
                q.put(sentinel)

        asyncio.run_coroutine_threadsafe(producer(), self._loop)

        while True:
            item = q.get()
            if item is sentinel:
                break
            if isinstance(item, Exception):
                raise item
            yield item

# --- Patched Orpheus model using the background loop ---
tts_event_loop = BackgroundEventLoop()

class PatchedOrpheusModel(OrpheusModel):
    def generate_tokens_sync(self, prompt, voice=None, request_id=None, temperature=0.6, top_p=0.8, max_tokens=1200, stop_token_ids=[49158], repetition_penalty=1.3):
        # The fix: if no request_id is provided, generate a new unique one.
        # This solves the "id already running" error on repeated calls.
        if request_id is None:
            request_id = str(uuid.uuid4())

        prompt_string = self._format_prompt(prompt, voice)
        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stop_token_ids=stop_token_ids,
            repetition_penalty=repetition_penalty,
        )
        async_gen = self.engine.generate(
            prompt=prompt_string,
            sampling_params=sampling_params,
            request_id=request_id,  # Use the unique ID
        )
        for result in tts_event_loop.run_generator(async_gen):
            if not isinstance(result, RequestOutput):
                raise TypeError(f"Unexpected result type: {type(result)}")
            yield result.outputs[0].text

# --- Persistent global model ---
model = None


def load_yaml():
    """Load tester.yaml and return its contents as a dict, or None on error."""
    file_path = "tester.yaml"
    try:
        with open(file_path, "r") as file:
            config = yaml.safe_load(file)
        return config
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return None
    except yaml.YAMLError as e:
        print(f"Error parsing YAML file: {e}")
        return None


def setup_model():
    global model
    if model is None:
        print("Loading TTS model...")
        model = PatchedOrpheusModel(model_name="./miko-tts-biboo/checkpoint-3021")
        print("✅ Model loaded and ready.")


def synthesize_for_scene(
    prompt: str,
    voice: str = "miko",
    temperature: float = 0.6,
    top_p: float = 0.9,
    repetition_penalty: float = 1.3,
    max_tokens: int = 1200,
):
    global model

    # NOTE: the keyword arguments above are currently ignored; the values
    # loaded from tester.yaml below are what is actually used.
    config = load_yaml()

    # This works correctly in parallel because each call triggers a unique
    # request_id in PatchedOrpheusModel above.
    chunks = bytearray()
    for chunk in model.generate_speech(
        prompt=prompt,
        voice=config["voice"],
        temperature=config["temperature"],
        top_p=config["top_p"],
        max_tokens=config["max_tokens"],
        repetition_penalty=config["repetition_penalty"],
    ):
        chunks.extend(chunk)

    # Wrap the raw 16-bit mono PCM at 24 kHz in a WAV container
    buffer = BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)
        wf.writeframes(chunks)

    audio_bytes = buffer.getvalue()
    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
    return audio_bytes, audio_base64
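The `BackgroundEventLoop` bridge is independent of vLLM and can be exercised on its own; a self-contained sketch with a toy async generator (assumes the class above is in scope, since importing tts_engines itself also pulls in orpheus_tts and vllm):

```python
# Sketch: drive a toy async generator through BackgroundEventLoop synchronously.
import asyncio

async def ticks(n):
    for i in range(n):
        await asyncio.sleep(0)  # yield control back to the loop
        yield i

loop = BackgroundEventLoop()
print(list(loop.run_generator(ticks(3))))  # -> [0, 1, 2]
```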