zirobtc committed · verified
Commit 4ffb3b6 · 1 Parent(s): c0a8ea3

Uploading DART folder into model repo

finetune/config.yaml ADDED
@@ -0,0 +1,17 @@
+ # CHANGE THIS TO YOUR OWN DATASET
+ TTS_dataset: zirobtc/biboo-dataset-tokenised
+
+ model_name: "canopylabs/orpheus-tts-0.1-finetune-prod"
+
+ # Training Args
+ epochs: 3
+ batch_size: 1
+ number_processes: 1
+ pad_token: 128263
+ save_steps: 5000
+ learning_rate: 5.0e-5
+
+ # Naming and paths
+ save_folder: "miko-tts-biboo"
+ project_name: "miko-biboo"
+ run_name: "5e5-0"
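Both training scripts below read every one of these keys at startup, so a typo'd or missing key only surfaces as a `KeyError` partway through launch. A minimal pre-flight check, sketched here against the key list above (not part of the commit):

```python
# Hypothetical pre-flight check for config.yaml (assumes the keys listed above).
import yaml

REQUIRED_KEYS = [
    "TTS_dataset", "model_name", "epochs", "batch_size", "number_processes",
    "pad_token", "save_steps", "learning_rate",
    "save_folder", "project_name", "run_name",
]

with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

missing = [key for key in REQUIRED_KEYS if key not in config]
if missing:
    raise KeyError(f"config.yaml is missing keys: {missing}")
print("config OK:", config["TTS_dataset"], "->", config["save_folder"])
```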
finetune/lora.py ADDED
@@ -0,0 +1,74 @@
+ from datasets import load_dataset
+ from peft import LoraConfig, get_peft_model
+ from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
+ import numpy as np
+ import yaml
+ import wandb
+
+ config_file = "config.yaml"
+
+ with open(config_file, "r") as file:
+     config = yaml.safe_load(file)
+
+ dsn = config["TTS_dataset"]
+
+ model_name = config["model_name"]
+ run_name = config["run_name"]
+ project_name = config["project_name"]
+ base_repo_id = config["save_folder"]
+ epochs = config["epochs"]
+ batch_size = config["batch_size"]
+ save_steps = config["save_steps"]
+ pad_token = config["pad_token"]
+ number_processes = config["number_processes"]
+ learning_rate = config["learning_rate"]
+
+ lora_rank = 32
+ lora_alpha = 64
+ lora_dropout = 0.0
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2")
+
+ lora_config = LoraConfig(
+     r=lora_rank,
+     lora_alpha=lora_alpha,
+     lora_dropout=lora_dropout,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
+     bias="none",
+     modules_to_save=["lm_head", "embed_tokens"],  # optionally also train the embeddings and LM head
+     task_type="CAUSAL_LM",
+     use_rslora=True,
+ )
+
+ model = get_peft_model(model, lora_config)
+
+ ds = load_dataset(dsn, split="train")
+
+ wandb.init(project=project_name, name=run_name)
+
+ training_args = TrainingArguments(
+     overwrite_output_dir=True,
+     num_train_epochs=epochs,
+     per_device_train_batch_size=batch_size,
+     logging_steps=1,
+     bf16=True,
+     output_dir=f"./{base_repo_id}",
+     report_to="wandb",
+     save_steps=save_steps,
+     remove_unused_columns=True,
+     learning_rate=learning_rate,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=ds,
+ )
+
+ trainer.train()
+
+ merged_model = model.merge_and_unload()
+
+ merged_model.save_pretrained(f"./{base_repo_id}/merged")
+ tokenizer.save_pretrained(f"./{base_repo_id}/merged")
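After training, the script merges the LoRA weights back into the base model and writes the result to `./miko-tts-biboo/merged`. A quick way to confirm that directory reloads cleanly, sketched under the assumption that training completed with the config above (the model emits audio tokens, so the decoded output is only a load check, not intelligible text):

```python
# Hedged smoke test: reload the merged checkpoint written by lora.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

merged_dir = "./miko-tts-biboo/merged"  # save_folder from config.yaml + "/merged"
tokenizer = AutoTokenizer.from_pretrained(merged_dir)
model = AutoModelForCausalLM.from_pretrained(merged_dir, torch_dtype=torch.bfloat16)

inputs = tokenizer("test prompt", return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output[0]))
```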
finetune/requirments.txt ADDED
@@ -0,0 +1,6 @@
+ flash_attn
+ transformers==4.51.3
+ trl==0.8.0
+ datasets
+ wandb
+ accelerate
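Note that `peft` (imported by `lora.py`) and `vllm` / `orpheus_tts` (imported by `tts_engines.py`) are not listed here, so they presumably come from the base environment. A small sanity check that the pinned versions resolved, as a sketch:

```python
# Environment sanity check (sketch): confirm the pinned versions installed.
import transformers
import trl

assert transformers.__version__ == "4.51.3", transformers.__version__
assert trl.__version__ == "0.8.0", trl.__version__

# Imported by the scripts in this folder but not pinned above:
import peft         # used by lora.py
import vllm         # used by tts_engines.py
import orpheus_tts  # used by tts_engines.py
print("environment OK")
```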
finetune/setup.py ADDED
@@ -0,0 +1,35 @@
+ # prepare_local_checkpoint.py
+
+ import os
+ import shutil
+ from huggingface_hub import snapshot_download
+
+ # The repository where the base Orpheus model's tokenizer files are located
+ PRETRAINED_REPO = "zirobtc/miko-voice-tts-3e-5-7"
+ # The local directory of your fine-tuned model checkpoint.
+ # Ensure this path exists and contains your fine-tuned model weights (e.g., model.safetensors).
+ CHECKPOINT_DIR = "./miko-tts-biboo/checkpoint-3021"
+
+
+
+ # --- 1. Download tokenizer files from the pretrained model ---
+ print(f"Downloading tokenizer files from {PRETRAINED_REPO}...")
+ tokenizer_files = [
+     "tokenizer.json",
+     "tokenizer_config.json",
+     "special_tokens_map.json"
+ ]
+ # snapshot_download fetches these specific files into the local cache directory
+ pretrained_tokenizer_local_cache_path = snapshot_download(PRETRAINED_REPO, allow_patterns=tokenizer_files)
+ print(f"✅ Tokenizer files downloaded to cache: {pretrained_tokenizer_local_cache_path}")
+
+
+ # --- 2. Copy tokenizer files into your local checkpoint directory ---
+ print(f"Copying tokenizer files to local checkpoint directory: {CHECKPOINT_DIR}...")
+ for filename in tokenizer_files:
+     source_path = os.path.join(pretrained_tokenizer_local_cache_path, filename)
+     destination_path = os.path.join(CHECKPOINT_DIR, filename)
+     shutil.copy(source_path, destination_path)
+     print(f"Copied {filename} to {destination_path}")
+
+ print("🎉 Tokenizer files successfully placed in your local fine-tuned model checkpoint directory.")
finetune/tester.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import sys
+ import uuid
+ import tts_engines as tts_engine  # the engine module in this folder is tts_engines.py
+
+ def main():
+     print("🚀 Orpheus TTS CLI Tester 🚀")
+     print("Type your text and press Enter to synthesize speech.")
+     print("Press Ctrl+D (EOF) or type 'exit' to quit.")
+
+     # Ensure the output directory exists
+     output_dir = "generated_audio"
+     os.makedirs(output_dir, exist_ok=True)
+     print(f"Audio files will be saved to: {os.path.abspath(output_dir)}")
+
+     try:
+         # Load the TTS model once at the start
+         tts_engine.setup_model()
+     except Exception as e:
+         print(f"❌ Error setting up TTS model: {e}")
+         print("Please ensure 'vllm' is installed and your 'config.yaml' is correctly configured.")
+         sys.exit(1)
+
+     while True:
+         try:
+             prompt = input("\nEnter text to synthesize: ").strip()
+
+             if prompt.lower() == 'exit':
+                 break
+             if not prompt:
+                 print("Please enter some text.")
+                 continue
+
+             print(f"Synthesizing: '{prompt}'...")
+             try:
+                 audio_bytes, _ = tts_engine.synthesize_for_scene(prompt=prompt)
+
+                 # Generate a unique filename for the WAV output
+                 output_filename = os.path.join(output_dir, f"output_{uuid.uuid4().hex}.wav")
+                 with open(output_filename, "wb") as f:
+                     f.write(audio_bytes)
+                 print(f"✅ Audio saved to: {output_filename}")
+
+             except Exception as e:
+                 print(f"❌ Error during synthesis: {e}")
+                 print("Make sure your model is correctly loaded and accessible.")
+
+         except EOFError:
+             print("\nExiting tester.")
+             break
+         except KeyboardInterrupt:
+             print("\nExiting tester.")
+             break
+
+ if __name__ == "__main__":
+     main()
finetune/tester.yaml ADDED
@@ -0,0 +1,5 @@
+ voice: "miko"
+ temperature: 0.6
+ top_p: 0.9
+ max_tokens: 2048
+ repetition_penalty: 1.3
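These values are read back by `load_yaml()` in `tts_engines.py` and passed through `generate_speech()`, where the sampling-related ones end up as vLLM sampling parameters (the `voice` key is used for prompt formatting instead). Roughly, as a sketch:

```python
# Sketch of how tester.yaml maps onto vLLM sampling (mirrors tts_engines.py).
import yaml
from vllm import SamplingParams

with open("tester.yaml", "r") as f:
    cfg = yaml.safe_load(f)

params = SamplingParams(
    temperature=cfg["temperature"],                # 0.6
    top_p=cfg["top_p"],                            # 0.9
    max_tokens=cfg["max_tokens"],                  # 2048
    repetition_penalty=cfg["repetition_penalty"],  # 1.3
)
print(params)
```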
finetune/train.py ADDED
@@ -0,0 +1,53 @@
+ from datasets import load_dataset
+ from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
+ import numpy as np
+ import yaml
+ import wandb
+
+ config_file = "config.yaml"
+
+ with open(config_file, "r") as file:
+     config = yaml.safe_load(file)
+
+ dsn = config["TTS_dataset"]
+
+ model_name = config["model_name"]
+ run_name = config["run_name"]
+ project_name = config["project_name"]
+ base_repo_id = config["save_folder"]
+ epochs = config["epochs"]
+ batch_size = config["batch_size"]
+ save_steps = config["save_steps"]
+ pad_token = config["pad_token"]
+ number_processes = config["number_processes"]
+ learning_rate = config["learning_rate"]
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="flash_attention_2")
+
+
+ ds = load_dataset(dsn, split="train")
+
+ wandb.init(project=project_name, name=run_name)
+
+ training_args = TrainingArguments(
+     overwrite_output_dir=True,
+     num_train_epochs=epochs,
+     per_device_train_batch_size=batch_size,
+     logging_steps=1,
+     bf16=True,
+     output_dir=f"./{base_repo_id}",
+     report_to="wandb",
+     save_steps=save_steps,
+     remove_unused_columns=True,
+     learning_rate=learning_rate,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=ds,
+ )
+
+ trainer.train()
+
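Unlike `lora.py`, this full fine-tune script ends at `trainer.train()`, so its only outputs are the periodic checkpoints written every `save_steps` steps. If a final consolidated model is wanted, something like the following could be appended (a sketch, not part of the commit):

```python
# Possible tail for train.py: persist a final model + tokenizer alongside
# the step checkpoints. `trainer`, `tokenizer`, and `base_repo_id` are the
# variables already defined in the script above.
trainer.save_model(f"./{base_repo_id}/final")
tokenizer.save_pretrained(f"./{base_repo_id}/final")
```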
finetune/tts_engines.py ADDED
@@ -0,0 +1,146 @@
+ # tts_engines.py
+
+ import wave
+ import asyncio
+ import uuid  # used to generate unique request IDs
+ import threading
+ import queue
+ import base64
+ from io import BytesIO
+ import yaml
+
+ from orpheus_tts.engine_class import OrpheusModel
+ from vllm.outputs import RequestOutput
+ from vllm import SamplingParams
+
+ # --- Background loop to keep vLLM stable across requests ---
+ # Bridges vLLM's async generator API to a synchronous iterator.
+ class BackgroundEventLoop:
+     def __init__(self):
+         self._loop = asyncio.new_event_loop()
+         self._thread = threading.Thread(target=self._run_loop, daemon=True)
+         self._thread.start()
+
+     def _run_loop(self):
+         asyncio.set_event_loop(self._loop)
+         self._loop.run_forever()
+
+     def run_generator(self, async_gen):
+         q = queue.Queue()
+         sentinel = object()
+
+         async def producer():
+             try:
+                 async for item in async_gen:
+                     q.put(item)
+             except Exception as e:
+                 q.put(e)
+             finally:
+                 q.put(sentinel)
+
+         asyncio.run_coroutine_threadsafe(producer(), self._loop)
+
+         while True:
+             item = q.get()
+             if item is sentinel:
+                 break
+             if isinstance(item, Exception):
+                 raise item
+             yield item
+
+ # --- Patched Orpheus model using the background loop ---
+ tts_event_loop = BackgroundEventLoop()
+
+ class PatchedOrpheusModel(OrpheusModel):
+     # Overridden so that every call gets its own unique request_id.
+     def generate_tokens_sync(self, prompt, voice=None, request_id=None, temperature=0.6, top_p=0.8, max_tokens=1200, stop_token_ids=[49158], repetition_penalty=1.3):
+
+         # If no request_id is provided, generate a new unique one.
+         # This avoids the "id already running" error on repeated calls.
+         if request_id is None:
+             request_id = str(uuid.uuid4())
+
+         prompt_string = self._format_prompt(prompt, voice)
+         sampling_params = SamplingParams(
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             stop_token_ids=stop_token_ids,
+             repetition_penalty=repetition_penalty,
+         )
+         async_gen = self.engine.generate(
+             prompt=prompt_string,
+             sampling_params=sampling_params,
+             request_id=request_id  # use the unique ID
+         )
+         for result in tts_event_loop.run_generator(async_gen):
+             if not isinstance(result, RequestOutput):
+                 raise TypeError(f"Unexpected result type: {type(result)}")
+             yield result.outputs[0].text
+
+ # --- Persistent global model ---
+ # Loaded once by setup_model() and reused across calls.
+ model = None
+
+
+ def load_yaml():
+     """
+     Load tester.yaml and return the parsed config dict (or None on failure).
+     """
+     file_path = "tester.yaml"
+     try:
+         with open(file_path, 'r') as file:
+             config = yaml.safe_load(file)
+         return config
+
+     except FileNotFoundError:
+         print(f"Error: The file '{file_path}' was not found.")
+         return None
+     except yaml.YAMLError as e:
+         print(f"Error parsing YAML file: {e}")
+         return None
+
+
+
+ def setup_model():
+     global model
+     if model is None:
+         print("Loading TTS model...")
+         model = PatchedOrpheusModel(model_name="./miko-tts-biboo/checkpoint-3021")
+         print("✅ Model loaded and ready.")
+
+ def synthesize_for_scene(
+     prompt: str,
+     voice: str = "miko",
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     repetition_penalty: float = 1.3,
+     max_tokens: int = 1200,
+ ):
+     global model
+
+     config = load_yaml()
+
+     # Safe to call in parallel: each call triggers a unique request_id in
+     # PatchedOrpheusModel. Note: tester.yaml values override the keyword args.
+     chunks = bytearray()
+     for chunk in model.generate_speech(
+         prompt=prompt,
+         voice=config["voice"],
+         temperature=config["temperature"],
+         top_p=config["top_p"],
+         max_tokens=config["max_tokens"],
+         repetition_penalty=config["repetition_penalty"],
+     ):
+         chunks.extend(chunk)
+
+     buffer = BytesIO()
+     with wave.open(buffer, "wb") as wf:
+         wf.setnchannels(1)
+         wf.setsampwidth(2)
+         wf.setframerate(24000)
+         wf.writeframes(chunks)
+
+     audio_bytes = buffer.getvalue()
+     audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
+     return audio_bytes, audio_base64
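Taken together with `tester.yaml`, the module is used roughly like this (a sketch assuming a GPU host with `vllm` and `orpheus_tts` installed, and the checkpoint path hard-coded in `setup_model()` present on disk):

```python
# Minimal usage sketch for tts_engines.py.
import tts_engines

tts_engines.setup_model()  # loads PatchedOrpheusModel once into the module global
audio_bytes, audio_b64 = tts_engines.synthesize_for_scene(prompt="Hello there!")

with open("hello.wav", "wb") as f:
    f.write(audio_bytes)  # 24 kHz, 16-bit mono WAV, per the wave settings above
```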