niobures committed
Commit f316adb · verified · 1 Parent(s): e1c124f

Orpheus (tr)
.gitattributes CHANGED
@@ -190,3 +190,5 @@ en/Orpheus-Liam-GGUF/Orpheus-Liam.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
  en/Orpheus-Liam/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  tr/Orpheus-KhanAcademy-TR-Math/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  tr/Orpheus-KhanAcademy-TR/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tr/Orpheus-TTS-Turkish-PT-2000-GGUF/orpheus-tts-turkish-pt-2000.gguf filter=lfs diff=lfs merge=lfs -text
+ tr/Orpheus-TTS-Turkish-PT-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
tr/Orpheus-TTS-Turkish-PT-2000-GGUF/.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ orpheus-2.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+ orpheus-tts-turkish-pt-2000.gguf filter=lfs diff=lfs merge=lfs -text
tr/Orpheus-TTS-Turkish-PT-2000-GGUF/README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ license: mit
+ language:
+ - tr
+ base_model:
+ - Karayakar/Orpheus-TTS-Turkish-PT-2000
+ pipeline_tag: text-to-speech
+ tags:
+ - Turkish
+ datasets:
+ - Karayakar/OrpheusTTS110K-TR-Data
+ ---
tr/Orpheus-TTS-Turkish-PT-2000-GGUF/orpheus-tts-turkish-pt-2000.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cad772a1af61cf106651cd7345211929b97444157d99ec8522194c0ff7c2a338
+ size 6610828992
tr/Orpheus-TTS-Turkish-PT-2000-GGUF/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/Karayakar/Orpheus-TTS-Turkish-PT-2000-GGUF
tr/Orpheus-TTS-Turkish-PT-2000/.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
tr/Orpheus-TTS-Turkish-PT-2000/README.md ADDED
@@ -0,0 +1,385 @@
+ ---
+ license: mit
+ language:
+ - tr
+ base_model:
+ - canopylabs/orpheus-3b-0.1-pretrained
+ tags:
+ - karayakar
+ - Turkish
+ - Turkce
+ - TTS
+ - Orpheus
+ - Text-to-Speech
+ pipeline_tag: text-to-speech
+ ---
+
+ # Orpheus TTS Turkish Model
+
+ The Orpheus TTS Turkish pretrained model (step 2000)
+ is trained from "canopylabs/orpheus-3b-0.1-pretrained".
+
+ Over 60 hours of synthetic voice data were used for the initial training,
+ and an additional 160 hours of synthetic voice data were mixed in during training.
+ 400 emoji (real voice) recordings were used for emoji support.
+
+ You can interact with the model through a Flask API (see below).
+
+
+ # Emotion Support
+
+ The model supports the following emotion tags in the input text.
+ ```
+ <laugh> – laughing (gülme)
+ <chuckle> – chuckling (kıkırdama)
+ <sigh> – sighing (iç çekme)
+ <cough> – coughing (öksürme)
+ <sniffle> – sniffling (burnunu çekme)
+ <groan> – groaning (inleme)
+ <yawn> – yawning (esneme)
+ <gasp> – gasping (nefesi kesilme / şaşkınlıkla soluma)
+ ```
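+
+ For example, an emotion tag can be embedded directly in the request text (a hypothetical request body; the tag is passed to the model as-is):
+ ```
+ {
+ "text": "Bu gerçekten komikti <laugh> hâlâ gülüyorum."
+ }
+ ```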
+
+
+ # API
+
+ Flask is configured to run on port 5400 (you can change this in the script below).
+
+ ```
+ POST http://127.0.0.1:5400/generate HTTP/1.1
+ User-Agent: Fiddler
+ content-type: application/json
+ Host: 127.0.0.1:5400
+ Content-Length: 110
+
+ {
+ "text": "Merhaba, orpheusTTS Turkce deneme"
+ }
+
+ ```
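+
+ Equivalently, assuming the server is running locally on port 5400, the same request can be sent with curl:
+ ```
+ curl -X POST http://127.0.0.1:5400/generate \
+   -H "Content-Type: application/json" \
+   -d '{"text": "Merhaba, orpheusTTS Turkce deneme"}'
+ ```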
+
+ # Create Environment
+
+ Windows:
+ ```
+ # create a virtual environment
+ python -m venv venv
+ venv\Scripts\activate
+
+ python inference.py
+ ```
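+
+ On Linux/macOS the equivalent would be (assuming a python3 interpreter on PATH):
+ ```
+ python3 -m venv venv
+ source venv/bin/activate
+
+ python inference.py
+ ```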
+
+ # Training
+
+ For training with your own data, see:
+ ```
+ train.py
+ config.yaml
+ ```
+
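+ The last line of config.yaml suggests the training run is launched with Accelerate:
+ ```
+ accelerate launch train.py
+ ```
+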
+ # inference.py
+ (install the necessary libraries first)
+
+ ```
+ # install the matching torch build from https://pytorch.org/
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+ # pathlib is in the standard library and jsonify ships with Flask, so neither needs a separate install
+ pip install snac transformers huggingface_hub librosa numpy scipy Flask
+ ```
+
+ ```
+ import os
+ from snac import SNAC
+ from pathlib import Path
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from huggingface_hub import snapshot_download
+ import librosa
+ import numpy as np
+ from scipy.io.wavfile import write
+ import torchaudio
+ from flask import Flask, jsonify, request
+
+ modelLocalPath = "D:\\...\\Karayakar\\Orpheus-TTS-Turkish-PT-2000"
+
+
+ def load_orpheus_tokenizer(model_id: str = modelLocalPath) -> AutoTokenizer:
+     # tokenizers run on the CPU; no device placement argument is needed
+     tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True)
+     return tokenizer
+
+
+ def load_snac():
+     snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+     return snac_model
+
+
+ def load_orpheus_auto_model(model_id: str = modelLocalPath):
+     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, local_files_only=True, device_map="cuda")
+     model.cuda()
+     return model
+
+
+ def tokenize_audio(audio_file_path, snac_model):
+     audio_array, sample_rate = librosa.load(audio_file_path, sr=24000)
+     waveform = torch.from_numpy(audio_array).unsqueeze(0)
+     waveform = waveform.to(dtype=torch.float32)
+
+     waveform = waveform.unsqueeze(0)
+
+     with torch.inference_mode():
+         codes = snac_model.encode(waveform)
+
+     all_codes = []
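+     # SNAC produces three codebook streams at 1:2:4 temporal rates; each audio
+     # frame is flattened into 7 tokens, and each of the 7 slots gets its own
+     # 4096-wide id range starting at offset 128266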
+     for i in range(codes[0].shape[1]):
+         all_codes.append(codes[0][0][i].item() + 128266)
+         all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
+         all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
+         all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
+         all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
+         all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
+         all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))
+
+     return all_codes
+
+
+ def prepare_inputs(
+     fpath_audio_ref,
+     audio_ref_transcript: str,
+     text_prompts: list[str],
+     snac_model,
+     tokenizer,
+ ):
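+     # Note: fpath_audio_ref and audio_ref_transcript are currently unused; the
+     # zero-shot audio-prompt path is commented out below. The ids appear to follow
+     # the usual Orpheus special-token layout (e.g. 128259 start-of-human,
+     # 128260/128261 end-of-human/start-of-ai, 128257 start-of-speech).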
+
+     start_tokens = torch.tensor([[128259]], dtype=torch.int64)
+     end_tokens = torch.tensor([[128009, 128260, 128261, 128257]], dtype=torch.int64)
+     final_tokens = torch.tensor([[128258, 128262]], dtype=torch.int64)
+
+     all_modified_input_ids = []
+     for prompt in text_prompts:
+         input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+         #second_input_ids = torch.cat([zeroprompt_input_ids, start_tokens, input_ids, end_tokens], dim=1)
+         second_input_ids = torch.cat([start_tokens, input_ids, end_tokens], dim=1)
+         all_modified_input_ids.append(second_input_ids)
+
+     all_padded_tensors = []
+     all_attention_masks = []
+     max_length = max([modified_input_ids.shape[1] for modified_input_ids in all_modified_input_ids])
+
+     for modified_input_ids in all_modified_input_ids:
+         padding = max_length - modified_input_ids.shape[1]
+         padded_tensor = torch.cat([torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1)
+         attention_mask = torch.cat([torch.zeros((1, padding), dtype=torch.int64),
+                                     torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1)
+         all_padded_tensors.append(padded_tensor)
+         all_attention_masks.append(attention_mask)
+
+     all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
+     all_attention_masks = torch.cat(all_attention_masks, dim=0)
+
+     input_ids = all_padded_tensors.to("cuda")
+     attention_mask = all_attention_masks.to("cuda")
+     return input_ids, attention_mask
+
+
+ def inference(model, input_ids, attention_mask):
+     with torch.no_grad():
+         generated_ids = model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=2048,
+             do_sample=True,
+             temperature=0.2,
+             top_k=10,
+             top_p=0.9,
+             repetition_penalty=1.9,
+             num_return_sequences=1,
+             eos_token_id=128258,
+         )
+
+     generated_ids = torch.cat([generated_ids, torch.tensor([[128262]]).to("cuda")], dim=1)  # EOAI
+
+     return generated_ids
+
+
+ def convert_tokens_to_speech(generated_ids, snac_model):
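+     # keep only the tokens after the last start-of-speech marker (128257) and
+     # drop end-of-speech tokens (128258) before decoding back to audio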
+     token_to_find = 128257
+     token_to_remove = 128258
+     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
+
+     if len(token_indices[1]) > 0:
+         last_occurrence_idx = token_indices[1][-1].item()
+         cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
+     else:
+         cropped_tensor = generated_ids
+
+     processed_rows = []
+     for row in cropped_tensor:
+         masked_row = row[row != token_to_remove]
+         processed_rows.append(masked_row)
+
+     code_lists = []
+     for row in processed_rows:
+         row_length = row.size(0)
+         new_length = (row_length // 7) * 7  # trim to a whole number of 7-token frames
+         trimmed_row = row[:new_length]
+         trimmed_row = [t - 128266 for t in trimmed_row]
+         code_lists.append(trimmed_row)
+
+     my_samples = []
+     for code_list in code_lists:
+         samples = redistribute_codes(code_list, snac_model)
+         my_samples.append(samples)
+
+     return my_samples
+
+
+ def redistribute_codes(code_list, snac_model):
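+     # inverse of tokenize_audio's flattening: un-interleave each 7-token frame
+     # back into SNAC's three codebook streams and remove the per-slot 4096 offsets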
+     layer_1 = []
+     layer_2 = []
+     layer_3 = []
+
+     for i in range(len(code_list) // 7):
+         layer_1.append(code_list[7 * i])
+         layer_2.append(code_list[7 * i + 1] - 4096)
+         layer_3.append(code_list[7 * i + 2] - (2 * 4096))
+         layer_3.append(code_list[7 * i + 3] - (3 * 4096))
+         layer_2.append(code_list[7 * i + 4] - (4 * 4096))
+         layer_3.append(code_list[7 * i + 5] - (5 * 4096))
+         layer_3.append(code_list[7 * i + 6] - (6 * 4096))
+
+     codes = [
+         torch.tensor(layer_1).unsqueeze(0),
+         torch.tensor(layer_2).unsqueeze(0),
+         torch.tensor(layer_3).unsqueeze(0)
+     ]
+     audio_hat = snac_model.decode(codes)
+     return audio_hat
+
+
+ def to_wav_from(samples: list) -> list[np.ndarray]:
+     """Converts a list of PyTorch tensors (or NumPy arrays) to NumPy arrays."""
+     processed_samples = []
+
+     for s in samples:
+         if isinstance(s, torch.Tensor):
+             s = s.detach().squeeze().to('cpu').numpy()
+         else:
+             s = np.squeeze(s)
+
+         processed_samples.append(s)
+
+     return processed_samples
+
+
+ def zero_shot_tts(fpath_audio_ref, audio_ref_transcript, texts: list[str], model, snac_model, tokenizer):
+     print(f"fpath_audio_ref {fpath_audio_ref}")
+     print(f"audio_ref_transcript {audio_ref_transcript}")
+     print(f"texts {texts}")
+     inp_ids, attn_mask = prepare_inputs(fpath_audio_ref, audio_ref_transcript, texts, snac_model, tokenizer)
+     print(f"input_id_len:{len(inp_ids)}")
+     gen_ids = inference(model, inp_ids, attn_mask)
+     samples = convert_tokens_to_speech(gen_ids, snac_model)
+     wav_forms = to_wav_from(samples)
+     return wav_forms
+
+
+ def save_wav(samples: list[np.ndarray], sample_rate: int, filenames: list[str]):
+     """Saves a list of tensors as .wav files.
+
+     Args:
+         samples (list[torch.Tensor]): List of audio tensors.
+         sample_rate (int): Sample rate in Hz.
+         filenames (list[str]): List of filenames to save.
+     """
+     wav_data = to_wav_from(samples)
+
+     for data, filename in zip(wav_data, filenames):
+         write(filename, sample_rate, data.astype(np.float32))
+         print(f"saved to {filename}")
+
+
+ def get_ref_audio_and_transcript(root_folder: str):
+     root_path = Path(root_folder)
+     print(f"root_path {root_path}")
+     out = []
+     for speaker_folder in root_path.iterdir():
+         if speaker_folder.is_dir():  # Ensure it's a directory
+             wav_files = list(speaker_folder.glob("*.wav"))
+             txt_files = list(speaker_folder.glob("*.txt"))
+
+             if wav_files and txt_files:
+                 ref_audio = wav_files[0]  # Assume only one .wav file per folder
+                 transcript = txt_files[0].read_text(encoding="utf-8").strip()
+                 out.append((ref_audio, transcript))
+
+     return out
+
+ app = Flask(__name__)
+
+
+ @app.route('/generate', methods=['POST'])
+ def generate():
+     content = request.json
+     process_data(content)
+     response_body = {
+         'received': content,
+         'status': 'success'
+     }
+     response = jsonify(response_body)
+     response.headers['Content-Type'] = 'application/json; charset=utf-8'
+     return response
+
+
+ def process_data(jsonText):
+     texts = [f"{jsonText['text']}"]
+     #print(f"texts:{texts}")
+     #print(f"prompt_pairs:{prompt_pairs}")
+     for fpath_audio, audio_transcript in prompt_pairs:
+         print(f"zero shot: {fpath_audio} {audio_transcript}")
+         wav_forms = zero_shot_tts(fpath_audio, audio_transcript, texts, model, snac_model, tokenizer)
+
+         from datetime import datetime  # os and Path are already imported at the top
+         out_dir = Path(fpath_audio).parent / "inference"
+         #print(f"out_dir:{out_dir}")
+         out_dir.mkdir(parents=True, exist_ok=True)
+         timestamp_str = str(int(datetime.now().timestamp()))
+         file_names = [f"{out_dir.as_posix()}/{Path(fpath_audio).stem}_{i}_{timestamp_str}.wav" for i, t in enumerate(texts)]
+         #print(f"file_names:{file_names}")
+         save_wav(wav_forms, 24000, file_names)
+
+
+ if __name__ == "__main__":
+     tokenizer = load_orpheus_tokenizer()
+     model = load_orpheus_auto_model()
+     snac_model = load_snac()
+     prompt_pairs = get_ref_audio_and_transcript("D:\\AI_APPS\\Orpheus-TTS\\data")
+     print("models loaded")
+     app.run(debug=True, port=5400)
+ ```
tr/Orpheus-TTS-Turkish-PT-2000/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "G:\\OPENAI\\Orpheus-TTS\\pretrain\\checkpoints_Orpheus_TTS_KA_60HRS_24000Khz\\checkpoint-500",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "eos_token_id": 128001,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 3072,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 24,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "factor": 32.0,
+     "high_freq_factor": 4.0,
+     "low_freq_factor": 1.0,
+     "original_max_position_embeddings": 8192,
+     "rope_type": "llama3"
+   },
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.49.0",
+   "use_cache": true,
+   "vocab_size": 156940
+ }
tr/Orpheus-TTS-Turkish-PT-2000/config.yaml ADDED
@@ -0,0 +1,23 @@
+ # Model
+ model_name: "Karayakar/Orpheus-TTS-Turkish-PT-2000"
+ tokenizer_name: "Karayakar/Orpheus-TTS-Turkish-PT-2000"
+
+
+ # Training Args
+ epochs: 1
+ batch_size: 1
+ number_processes: 1
+ pad_token: 128263
+ save_steps: 100
+ learning_rate: 5.0e-5
+ ratio: 1  # text:TTS batch ratio; note that unquoted "1:1" is read as the integer 61 by YAML 1.1 loaders such as PyYAML
+
+ # Datasets
+ text_QA_dataset: "your train dataset"
+ TTS_dataset: "your train dataset"
+
+ # Naming and paths
+ save_folder: "checkpoints_Orpheus_TTS"
+ project_name: "pretrain-orpheus"
+ run_name: "pretrain-orpheus"
+ #accelerate launch train.py
tr/Orpheus-TTS-Turkish-PT-2000/generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 128000,
+   "do_sample": true,
+   "eos_token_id": 128001,
+   "temperature": 0.6,
+   "top_p": 0.9,
+   "transformers_version": "4.49.0"
+ }
tr/Orpheus-TTS-Turkish-PT-2000/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61d4ae63f446c953ff136785c60538c47ddc30a6e0eaaa41895e8225071b1c80
+ size 4948557560
tr/Orpheus-TTS-Turkish-PT-2000/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c041d2ac07dc3e428cbe9bc5882bce45fc3a83a05d26e93411b3ffc772562fe8
+ size 4932808960
tr/Orpheus-TTS-Turkish-PT-2000/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bbcde72de8c320bf456ca24c68e8cd3ce375d76d214415b32fe064422026599
+ size 3322130968
tr/Orpheus-TTS-Turkish-PT-2000/model.safetensors.index.json ADDED
@@ -0,0 +1,261 @@
+ {
+   "metadata": {
+     "total_size": 13203468288
+   },
+   "weight_map": {
+     "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+     "model.norm.weight": "model-00003-of-00003.safetensors"
+   }
+ }
tr/Orpheus-TTS-Turkish-PT-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95a013621fb4d350dab6ace0142f9e142286c68602c0f0a6438d3d1e3b4419f7
+ size 26407159030
tr/Orpheus-TTS-Turkish-PT-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea005b38dffb3ddb68fe35f651e4ee5e119bf913293dda58efdd7f56f4335656
+ size 14244
tr/Orpheus-TTS-Turkish-PT-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da762999d2c4127b623e631487e702dde005ba1214253149c024342697124c4f
+ size 1064
tr/Orpheus-TTS-Turkish-PT-2000/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/Karayakar/Orpheus-TTS-Turkish-PT-2000
tr/Orpheus-TTS-Turkish-PT-2000/special_tokens_map.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "additional_special_tokens": [
+     "<|audio|>"
+   ],
+   "bos_token": {
+     "content": "<|begin_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|eot_id|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|finetune_right_pad_id|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tr/Orpheus-TTS-Turkish-PT-2000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc3fecb199b4170636dbfab986d25f628157268d37b861f9cadaca60b1353bce
+ size 22849547
tr/Orpheus-TTS-Turkish-PT-2000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
tr/Orpheus-TTS-Turkish-PT-2000/train.py ADDED
@@ -0,0 +1,236 @@
+ import torch
+ from datasets import load_dataset, load_from_disk
+ from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
+ import numpy as np
+ from torch.distributed.fsdp import (
+     FullyShardedDataParallel as FSDP, FullStateDictConfig, StateDictType)
+ from torch.utils.data import DataLoader, Dataset
+ from torch.utils.data.distributed import DistributedSampler
+ import yaml
+ import wandb
+ from huggingface_hub import HfApi
+
+ config_file = "config.yaml"
+
+ with open(config_file, "r") as file:
+     config = yaml.safe_load(file)
+
+ dsn1 = config["text_QA_dataset"]
+ dsn2 = config["TTS_dataset"]
+
+ model_name = config["model_name"]
+ tokenizer_name = config["tokenizer_name"]
+
+ run_name = config["run_name"]
+ project_name = config["project_name"]
+ base_repo_id = config["save_folder"]
+
+ epochs = config["epochs"]
+ batch_size = config["batch_size"]
+ save_steps = config["save_steps"]
+ pad_token = config["pad_token"]
+ number_processes = config["number_processes"]
+ learning_rate = config["learning_rate"]
+ config_ratio = config["ratio"]
+
+
+ class BatchedRatioDataset(Dataset):
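+     # Interleaves two datasets at batch granularity: each cycle yields `ratio`
+     # batches from dataset1 (text) followed by one batch from dataset2 (TTS),
+     # so a non-shuffling sampler preserves the alternating pattern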
+     def __init__(self, dataset1, dataset2, batch_total, ratio=config_ratio):
+         self.dataset1 = dataset1
+         self.dataset2 = dataset2
+         self.batch_total = batch_total
+         self.ratio = ratio
+
+         num_cycles_ds1 = len(dataset1) // (batch_total * ratio)
+         num_cycles_ds2 = len(dataset2) // batch_total
+         self.num_cycles = min(num_cycles_ds1, num_cycles_ds2)
+
+         self.length = self.num_cycles * (ratio + 1) * batch_total
+
+     def __len__(self):
+         print("accessing length", self.length)
+         return int(self.length)
+
+     def __getitem__(self, index):
+         # Compute the cycle length in terms of samples.
+         cycle_length = (self.ratio + 1) * self.batch_total
+         cycle = index // cycle_length
+         pos_in_cycle = index % cycle_length
+
+         if pos_in_cycle < self.ratio * self.batch_total:
+             batch_in_cycle = pos_in_cycle // self.batch_total
+             sample_in_batch = pos_in_cycle % self.batch_total
+             ds1_index = cycle * self.ratio * self.batch_total + batch_in_cycle * self.batch_total + sample_in_batch
+             return self.dataset1[ds1_index]
+         else:
+             # We are in the dataset2 batch for this cycle.
+             sample_in_batch = pos_in_cycle - self.ratio * self.batch_total
+             ds2_index = cycle * self.batch_total + sample_in_batch
+             return self.dataset2[ds2_index]
+
+
+ class AlternatingDistributedSampler(DistributedSampler):
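+     # Deterministic strided sharding without shuffling, so the batch ordering
+     # constructed by BatchedRatioDataset survives distributed training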
+     def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False):
+         super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+         self.shuffle = shuffle
+
+     def __iter__(self):
+         indices = list(range(len(self.dataset)))
+         indices = indices[self.rank:self.total_size:self.num_replicas]
+         return iter(indices)
+
+
+ class FSDPTrainer(Trainer):
+     def __init__(self, *args, log_ratio=config_ratio, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.repo_id = base_repo_id
+         self.api = HfApi()
+
+         self.log_ratio = log_ratio
+         self.text_step = 0
+         self.audio_step = 0
+
+     def get_train_dataloader(self):
+         sampler = AlternatingDistributedSampler(
+             self.train_dataset,
+             num_replicas=torch.distributed.get_world_size(),
+             rank=torch.distributed.get_rank(),
+             shuffle=False,
+         )
+
+         return DataLoader(
+             self.train_dataset,
+             batch_size=self.args.per_device_train_batch_size,
+             sampler=sampler,
+             collate_fn=self.data_collator,
+             drop_last=self.args.dataloader_drop_last,
+             num_workers=0,
+             pin_memory=self.args.dataloader_pin_memory,
+         )
+
+     def log(self, logs, start_time=None):
+         super().log(logs, start_time)
+         if self.is_world_process_zero():
+             global_step = self.state.global_step
+             # Each cycle is (log_ratio + 1) steps: first log_ratio steps for text_loss, then one for audio_loss.
+             cycle_length = self.log_ratio + 1
+             if (global_step % cycle_length) + self.log_ratio - 1 < self.log_ratio:
+                 wandb.log({"audio_loss": logs["loss"], "audio_step": self.audio_step})
+                 self.audio_step += 1
+             else:
+                 wandb.log({"text_loss": logs["loss"], "text_step": self.text_step})
+                 self.text_step += 1
+
+     def save_model(self, output_dir=None, _internal_call=False):
+         if output_dir is None:
+             output_dir = self.args.output_dir
+         self.save_and_push_model(output_dir)
+
+     def save_and_push_model(self, output_dir):
+         save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+         with FSDP.state_dict_type(self.model, StateDictType.FULL_STATE_DICT, save_policy):
+             cpu_state_dict = self.model.state_dict()
+         self.model.save_pretrained(output_dir, state_dict=cpu_state_dict)
+
+
+ def data_collator(features):
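+     # Right-pads variable-length examples into one batch: pad_token for inputs,
+     # 0 for the attention mask, and -100 for labels so padding is ignored by the loss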
+     # max_length = 2656 # set a crop based on vram - ideally you have stacked all sequences to the same length
+     # from 3b on 8 h100s fsdp, at bf16, 8192 works well.
+     input_ids = [f["input_ids"] for f in features]
+
+     if any("attention_mask" not in f for f in features):
+         attention_mask = [[1] * len(ids) for ids in input_ids]
+     else:
+         attention_mask = [f["attention_mask"] for f in features]
+
+     if any("labels" not in f for f in features):
+         labels = input_ids
+     else:
+         labels = [f["labels"] for f in features]
+
+     input_ids = torch.nn.utils.rnn.pad_sequence([torch.tensor(
+         i, dtype=torch.long) for i in input_ids], batch_first=True, padding_value=pad_token)
+     attention_mask = torch.nn.utils.rnn.pad_sequence([torch.tensor(
+         m, dtype=torch.long) for m in attention_mask], batch_first=True, padding_value=0)
+     labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(
+         l, dtype=torch.long) for l in labels], batch_first=True, padding_value=-100)
+
+     return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
+
+ #wandb offline
+ #wandb.init(project=project_name, name=run_name)
+
+
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True)
+ #attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16,
+ model.cuda()
+
+ number_add_tokens = 7 * 4096 + 10
+ new_tokens = [f"<custom_token_{i}>" for i in range(0, number_add_tokens + 1)]
+ tokenizer.add_tokens(new_tokens)
+ model.resize_token_embeddings(len(tokenizer))
+
+
+ ds1 = load_from_disk(dsn1, keep_in_memory=True)  # split="train"
+ ds2 = load_from_disk(dsn2, keep_in_memory=True)  # split="train"
+
+
+ batch_total = batch_size * number_processes
+ train_dataset = BatchedRatioDataset(ds1, ds2, batch_total, ratio=config_ratio)
185
+
186
+
187
+ #training_args = TrainingArguments(
188
+ # overwrite_output_dir=True,
189
+ # num_train_epochs=epochs,
190
+ # per_device_train_batch_size=batch_size,
191
+ # logging_steps=1,
192
+ # bf16=True,
193
+ # output_dir=f"./{base_repo_id}",
194
+ # #fsdp="auto_wrap",
195
+ # report_to="none",
196
+ # save_steps=save_steps,
197
+ # remove_unused_columns=True,
198
+ # learning_rate=learning_rate,
199
+ # lr_scheduler_type="cosine",
200
+ #)
201
+ #
202
+ #
203
+ #trainer = FSDPTrainer(
204
+ # model=model,
205
+ # args=training_args,
206
+ # train_dataset=train_dataset,
207
+ # data_collator=data_collator,
208
+ # log_ratio=config_ratio
209
+ #)
210
+ #
211
+ #trainer.train()
212
+
213
+
214
+ training_args = TrainingArguments(
215
+ overwrite_output_dir=True,
216
+ num_train_epochs=epochs,
217
+ per_device_train_batch_size=batch_size,
218
+ logging_steps=1,
219
+ bf16=True,
220
+ output_dir=f"./{base_repo_id}",
221
+ report_to="tensorboard",
222
+ save_steps=save_steps,
223
+ remove_unused_columns=True,
224
+ learning_rate=learning_rate,
225
+ lr_scheduler_type="cosine",
226
+ )
227
+
228
+ trainer = Trainer(
229
+ model=model,
230
+ args=training_args,
231
+ train_dataset=train_dataset,
232
+ data_collator=data_collator,
233
+ #log_ratio=config_ratio
234
+ )
235
+
236
+ trainer.train()
tr/Orpheus-TTS-Turkish-PT-2000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
tr/Orpheus-TTS-Turkish-PT-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2e17615c0117656ff03a59eab05516ad636e4ce6df4182952bb10a0fe6b8314
+ size 5368