PatnaikAshish commited on
Commit
7a915e5
·
verified ·
1 Parent(s): 12f7f5e

Update audiobook.py

Browse files
Files changed (1) hide show
  1. audiobook.py +195 -188
audiobook.py CHANGED
@@ -1,188 +1,195 @@
1
- import os
2
- import re
3
- import torch
4
- import numpy as np
5
- from scipy.io.wavfile import write
6
- from tts import commons
7
- from tts import utils
8
- from tts.models import SynthesizerTrn
9
- from text.symbols import symbols
10
- from text import text_to_sequence
11
- from phonemizer.backend.espeak.wrapper import EspeakWrapper
12
- from safetensors.torch import load_file
13
-
14
- _ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
15
- if os.path.exists(_ESPEAK_LIBRARY):
16
- EspeakWrapper.set_library(_ESPEAK_LIBRARY)
17
- print(f"βœ… Found eSpeak-ng: {_ESPEAK_LIBRARY}")
18
- else:
19
- print("⚠️ eSpeak-ng not found (ok if already working)")
20
-
21
-
22
- MODEL_PATH = "checkpoints/sonya-tts.safetensors"
23
- CONFIG_PATH = "checkpoints/config.json"
24
-
25
- OUTPUT_WAV_SHORT = "output.wav"
26
- OUTPUT_WAV_LONG = "audiobook.wav"
27
-
28
- USE_LONG_FORM = True # ← change to False for short text
29
-
30
- TEXT = """
31
- You’re 100 percent right. LeetCode solutions should be simple, readable,
32
- and teachable. This long form inference mode allows narration of long
33
- paragraphs without instability. The audio remains smooth and natural.
34
- without breaking audio quality. It works smoothly. The Problem: VITS crashes or creates garbage audio if you feed it a whole paragraph or page of text. The Solution: A script that automatically splits text into sentences, generates audio for each, and stitches them together with natural pauses.
35
- Why it stands out: It allows you to convert entire PDF chapters or articles into audio files automatically. Since the architectural changes for zero-shot cloning adding embeddings require retraining, let’s focus on inference-side features you can add right now to your existing G_10400.pth model.
36
- These features will turn your simple "Text-to-WAV" script into a powerful Audiobook & Content Creation Tool.
37
- """
38
-
39
- def save_wav_int16(path, audio, sample_rate):
40
- audio = np.clip(audio, -1.0, 1.0)
41
- audio = (audio * 32767).astype(np.int16)
42
- write(path, sample_rate, audio)
43
-
44
-
45
- def clean_text_for_vits(text):
46
- text = text.strip()
47
-
48
-
49
- text = text.replace("’", "'")
50
- text = text.replace("β€œ", '"').replace("”", '"')
51
- text = text.replace("–", "-").replace("β€”", "-")
52
-
53
-
54
- text = re.sub(r"[()\[\]{}<>]", "", text)
55
-
56
-
57
- text = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", text)
58
-
59
- text = re.sub(r"\s+", " ", text)
60
-
61
- return text
62
-
63
- def get_text(text, hps):
64
- text = clean_text_for_vits(text)
65
- text_norm = text_to_sequence(text, hps.data.text_cleaners)
66
- if hps.data.add_blank:
67
- text_norm = commons.intersperse(text_norm, 0)
68
- return torch.LongTensor(text_norm)
69
-
70
-
71
- def split_sentences(text):
72
- text = clean_text_for_vits(text)
73
- if not text:
74
- return []
75
- return re.split(r'(?<=[.!?])\s+', text)
76
-
77
-
78
- def generate_audiobook(
79
- net_g,
80
- hps,
81
- text,
82
- device,
83
- output_file,
84
- noise_scale=0.5,
85
- noise_scale_w=0.6,
86
- length_scale=1.0,
87
- base_pause=0.4,
88
- ):
89
- print("πŸ“– Long-form audiobook mode enabled")
90
-
91
- sentences = split_sentences(text)
92
- print(f"πŸ”Ή Sentences: {len(sentences)}")
93
-
94
- audio_chunks = []
95
-
96
- for i, sent in enumerate(sentences):
97
- sent = sent.strip()
98
- if not sent:
99
- continue
100
-
101
- stn_tst = get_text(sent, hps)
102
-
103
- with torch.no_grad():
104
- x = stn_tst.to(device).unsqueeze(0)
105
- x_len = torch.LongTensor([stn_tst.size(0)]).to(device)
106
-
107
- audio = net_g.infer(
108
- x,
109
- x_len,
110
- noise_scale=noise_scale,
111
- noise_scale_w=noise_scale_w,
112
- length_scale=length_scale,
113
- )[0][0, 0].cpu().numpy()
114
-
115
- if sent.endswith("?"):
116
- pause = base_pause + 0.15
117
- elif sent.endswith("!"):
118
- pause = base_pause
119
- else:
120
- pause = base_pause + 0.05
121
-
122
- silence = np.zeros(int(hps.data.sampling_rate * pause))
123
-
124
- audio_chunks.append(audio)
125
- audio_chunks.append(silence)
126
-
127
- print(f" βœ… Sentence {i+1}/{len(sentences)} done")
128
-
129
- final_audio = np.concatenate(audio_chunks)
130
- save_wav_int16(output_file, final_audio, hps.data.sampling_rate)
131
-
132
- print(f"πŸŽ‰ Audiobook saved: {os.path.abspath(output_file)}")
133
-
134
-
135
- def main():
136
- if not os.path.exists(CONFIG_PATH):
137
- print("❌ Config file not found")
138
- return
139
-
140
- hps = utils.get_hparams_from_file(CONFIG_PATH)
141
-
142
- device = "cuda" if torch.cuda.is_available() else "cpu"
143
- print(f"πŸš€ Using device: {device}")
144
-
145
- # Load model
146
- net_g = SynthesizerTrn(
147
- len(symbols),
148
- hps.data.filter_length // 2 + 1,
149
- hps.train.segment_size // hps.data.hop_length,
150
- **hps.model,
151
- ).to(device)
152
- net_g.eval()
153
-
154
- # Load checkpoint
155
- state_dict = load_file(MODEL_PATH, device=device)
156
- net_g.load_state_dict(state_dict)
157
- print(f"βœ… Loaded model: {MODEL_PATH}")
158
-
159
-
160
- if USE_LONG_FORM:
161
- generate_audiobook(
162
- net_g,
163
- hps,
164
- TEXT,
165
- device,
166
- OUTPUT_WAV_LONG,
167
- )
168
- else:
169
- print("πŸ—£οΈ Short-text inference")
170
-
171
- stn_tst = get_text(TEXT, hps)
172
- with torch.no_grad():
173
- x = stn_tst.to(device).unsqueeze(0)
174
- x_len = torch.LongTensor([stn_tst.size(0)]).to(device)
175
-
176
- audio = net_g.infer(
177
- x,
178
- x_len,
179
- noise_scale=0.5,
180
- noise_scale_w=0.6,
181
- length_scale=1.0,
182
- )[0][0, 0].cpu().numpy()
183
-
184
- save_wav_int16(OUTPUT_WAV_SHORT, audio, hps.data.sampling_rate)
185
- print(f"πŸ’Ύ Saved audio: {os.path.abspath(OUTPUT_WAV_SHORT)}")
186
-
187
- if __name__ == "__main__":
188
- main()
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ import numpy as np
5
+ from scipy.io.wavfile import write
6
+ from tts import commons
7
+ from tts import utils
8
+ from tts.models import SynthesizerTrn
9
+ from text.symbols import symbols
10
+ from text import text_to_sequence
11
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
12
+ from safetensors.torch import load_file
13
+ from huggingface_hub import hf_hub_download
14
+
15
# Windows-only eSpeak NG backend hookup for the phonemizer; the hard-coded
# DLL path is simply skipped on other platforms (or when espeak is already
# discoverable on its own).
_ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
if os.path.exists(_ESPEAK_LIBRARY):
    EspeakWrapper.set_library(_ESPEAK_LIBRARY)
    print(f"βœ… Found eSpeak-ng: {_ESPEAK_LIBRARY}")
else:
    print("⚠️ eSpeak-ng not found (ok if already working)")


# Hugging Face Hub fallback location for the weights and config.
REPO_ID = "PatnaikAshish/Sonya-TTS"
MODEL_FILENAME = "sonya-tts.safetensors"
CONFIG_FILENAME = "config.json"

# Local checkpoints take priority over the Hub download (checked in main()).
LOCAL_MODEL_PATH = "checkpoints/sonya-tts.safetensors"
LOCAL_CONFIG_PATH = "checkpoints/config.json"
OUTPUT_WAV_SHORT = "output.wav"
OUTPUT_WAV_LONG = "audiobook.wav"

USE_LONG_FORM = True  # ← change to False for short text

# Input text to narrate; long-form mode splits this into sentences.
TEXT = """
A neural network or Artificial Neural Network is a computer system inspired by the human brain,
using interconnected nodes neurons in layers to recognize complex patterns in data for tasks like
image recognition, language processing, and prediction
"""
39
+
40
def save_wav_int16(path, audio, sample_rate):
    """Write a float waveform to *path* as a 16-bit PCM WAV file.

    Samples are clamped to [-1.0, 1.0] before being scaled to the
    int16 range, so out-of-range values saturate instead of wrapping.
    """
    clamped = np.clip(audio, -1.0, 1.0)
    pcm16 = (clamped * 32767).astype(np.int16)
    write(path, sample_rate, pcm16)
44
+
45
+
46
def clean_text_for_vits(text):
    """Normalize raw text into the limited character set the VITS frontend accepts.

    Typographic quotes/dashes become ASCII, brackets and any other
    unsupported characters are dropped, and runs of whitespace collapse
    to a single space.
    """
    # Map typographic punctuation onto plain ASCII equivalents
    # (same replacement order as before).
    ascii_map = {
        "’": "'",
        "β€œ": '"',
        "”": '"',
        "–": "-",
        "β€”": "-",
    }
    cleaned = text.strip()
    for src, dst in ascii_map.items():
        cleaned = cleaned.replace(src, dst)

    # Strip bracket characters, then anything outside the model's
    # alphabet, then squeeze whitespace down to single spaces.
    cleaned = re.sub(r"[()\[\]{}<>]", "", cleaned)
    cleaned = re.sub(r"[^a-zA-Z0-9\s.,!?'\-]", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)

    return cleaned
63
+
64
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids for the synthesizer.

    Runs the same cleaning as long-form mode, feeds the project's
    ``text_to_sequence`` frontend, and — when the config sets
    ``add_blank`` — intersperses blank (id 0) tokens as during training.
    """
    sequence = text_to_sequence(clean_text_for_vits(text), hps.data.text_cleaners)
    if hps.data.add_blank:
        # Blank token between every symbol, matching the training setup.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
70
+
71
+
72
def split_sentences(text):
    """Split cleaned text into sentences at terminal punctuation.

    Returns an empty list when cleaning leaves nothing to narrate.
    """
    cleaned = clean_text_for_vits(text)
    # Split after ., ! or ? followed by whitespace (lookbehind keeps
    # the punctuation attached to its sentence).
    return re.split(r'(?<=[.!?])\s+', cleaned) if cleaned else []
77
+
78
+
79
def generate_audiobook(
    net_g,
    hps,
    text,
    device,
    output_file,
    noise_scale=0.5,
    noise_scale_w=0.6,
    length_scale=1.0,
    base_pause=0.4,
):
    """Narrate *text* sentence-by-sentence and write one stitched WAV file.

    Each sentence is synthesized independently, then the chunks are joined
    with a short silence whose length depends on the sentence's final
    punctuation.

    Parameters
    ----------
    net_g : the trained VITS generator (must expose ``.infer``).
    hps : hyper-parameter object; ``hps.data.sampling_rate`` is used for
        the silence length and the output WAV header.
    text : str — raw input; cleaned and sentence-split internally.
    device : str — "cuda" or "cpu"; tensors are moved there for inference.
    output_file : str — destination WAV path.
    noise_scale, noise_scale_w, length_scale : float — VITS inference knobs.
    base_pause : float — baseline inter-sentence silence, in seconds.
    """
    print("πŸ“– Long-form audiobook mode enabled")

    sentences = split_sentences(text)
    print(f"πŸ”Ή Sentences: {len(sentences)}")

    audio_chunks = []

    for i, sent in enumerate(sentences):
        sent = sent.strip()
        if not sent:
            continue

        stn_tst = get_text(sent, hps)

        with torch.no_grad():
            x = stn_tst.to(device).unsqueeze(0)
            x_len = torch.LongTensor([stn_tst.size(0)]).to(device)

            audio = net_g.infer(
                x,
                x_len,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
            )[0][0, 0].cpu().numpy()

        # Slightly longer pause after questions; exclamations get the base.
        if sent.endswith("?"):
            pause = base_pause + 0.15
        elif sent.endswith("!"):
            pause = base_pause
        else:
            pause = base_pause + 0.05

        # Match the synthesized audio's dtype so concatenation doesn't upcast
        # everything to float64.
        silence = np.zeros(int(hps.data.sampling_rate * pause), dtype=audio.dtype)

        audio_chunks.append(audio)
        audio_chunks.append(silence)

        print(f" βœ… Sentence {i+1}/{len(sentences)} done")

    # Bug fix: np.concatenate raises ValueError on an empty list, which
    # happened whenever the text cleaned down to nothing (or only blank
    # sentences). Bail out gracefully instead of crashing.
    if not audio_chunks:
        print("⚠️ No synthesizable sentences found; nothing written.")
        return

    final_audio = np.concatenate(audio_chunks)
    save_wav_int16(output_file, final_audio, hps.data.sampling_rate)

    print(f"πŸŽ‰ Audiobook saved: {os.path.abspath(output_file)}")
134
+
135
+
136
def main():
    """Entry point: resolve checkpoints, build the model, and synthesize."""
    # Prefer local checkpoints; fall back to downloading from the Hub.
    have_local = os.path.exists(LOCAL_MODEL_PATH) and os.path.exists(LOCAL_CONFIG_PATH)
    if have_local:
        print("βœ… Loading Sonya TTS from local checkpoints...")
        model_path, config_path = LOCAL_MODEL_PATH, LOCAL_CONFIG_PATH
    else:
        print("🌍 Downloading Sonya TTS from Hugging Face...")
        model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
        config_path = hf_hub_download(repo_id=REPO_ID, filename=CONFIG_FILENAME)

    hps = utils.get_hparams_from_file(config_path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"πŸš€ Using device: {device}")

    # Build the generator exactly as configured for training.
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model,
    ).to(device)
    net_g.eval()

    # Restore weights from the safetensors checkpoint.
    net_g.load_state_dict(load_file(model_path))
    print(f"βœ… Loaded model: {model_path}")

    if USE_LONG_FORM:
        generate_audiobook(
            net_g,
            hps,
            TEXT,
            device,
            OUTPUT_WAV_LONG,
        )
        return

    print("πŸ—£οΈ Short-text inference")

    stn_tst = get_text(TEXT, hps)
    with torch.no_grad():
        x = stn_tst.to(device).unsqueeze(0)
        x_len = torch.LongTensor([stn_tst.size(0)]).to(device)

        audio = net_g.infer(
            x,
            x_len,
            noise_scale=0.5,
            noise_scale_w=0.6,
            length_scale=1.0,
        )[0][0, 0].cpu().numpy()

    save_wav_int16(OUTPUT_WAV_SHORT, audio, hps.data.sampling_rate)
    print(f"πŸ’Ύ Saved audio: {os.path.abspath(OUTPUT_WAV_SHORT)}")
194
# Script entry point: only run inference when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()