abersbail committed on
Commit
0a88ee7
·
verified ·
1 Parent(s): 22d0cc3

Deploy tiny code-only TTS Space

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ artifacts/
README.md CHANGED
@@ -1,12 +1,61 @@
1
  ---
2
- title: Tiny Code Only Tts
3
- emoji: 🚀
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Tiny Code-Only TTS
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.23.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Tiny Code-Only TTS for Hugging Face Spaces
13
+
14
+ This project builds a simple text-to-speech system from code only.
15
+
16
+ - No API key
17
+ - No external model
18
+ - No pretrained checkpoint
19
+ - Pure Python waveform synthesis
20
+ - Gradio UI for Hugging Face Spaces
21
+
22
+ ## What it does
23
+
24
+ It converts text into robotic speech audio using a lightweight phoneme-style synthesizer. The engine uses handcrafted sound rules for vowels, fricatives, stops, nasals, liquids, and pauses.
25
+
26
+ This is a starter TTS project for deployment and experimentation. It is intentionally simple and CPU-friendly.
27
+
28
+ ## Project structure
29
+
30
+ ```text
31
+ .
32
+ ├── app.py
33
+ ├── requirements.txt
34
+ └── mini_tts/
35
+ ├── __init__.py
36
+ ├── config.py
37
+ ├── normalizer.py
38
+ ├── service.py
39
+ └── synth.py
40
+ ```
41
+
42
+ ## Run locally
43
+
44
+ ```bash
45
+ pip install -r requirements.txt
46
+ python app.py
47
+ ```
48
+
49
+ ## Deploy on Hugging Face Spaces
50
+
51
+ 1. Create a new Space.
52
+ 2. Choose `Gradio`.
53
+ 3. Upload these files.
54
+ 4. The Space will install the dependencies listed in `requirements.txt`.
55
+ 5. Open the app and generate speech directly from text.
56
+
57
+ ## Notes
58
+
59
+ - The voice is synthetic and simple by design.
60
+ - You can tune pitch, speed, and voice color in the UI.
61
+ - You can extend phoneme rules in `mini_tts/synth.py`.
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from mini_tts.service import LocalTTSService
4
+
5
+
6
# A single engine instance shared by every request; each synthesize() call
# is self-contained, so reuse across Gradio callbacks is fine.
service = LocalTTSService()


def run_tts(text: str, voice: str, speed: float, pitch: float):
    """Gradio callback: forward the UI control values to the local TTS service.

    Returns the service's ``((sample_rate, audio), status, normalized)`` tuple,
    matching the three output components of the interface.
    """
    return service.synthesize(text=text, voice=voice, speed=speed, pitch_shift=pitch)
16
+
17
+
18
# Build the Gradio UI: input controls on the left column, generated audio
# and diagnostic text on the right column.
with gr.Blocks(title="Tiny Code-Only TTS") as demo:
    gr.Markdown(
        """
        # Tiny Code-Only TTS
        A simple text-to-speech engine built from code only.

        - No API key
        - No hosted model
        - No pretrained checkpoint
        - Designed for Hugging Face Spaces
        """
    )

    with gr.Row():
        with gr.Column():
            # Text to synthesize.
            text = gr.Textbox(
                label="Text",
                value="Hello. This is a simple text to speech demo built only with code.",
                lines=8,
            )
            # Voice preset; choices mirror VOICE_PROFILES in mini_tts/synth.py.
            voice = gr.Dropdown(
                label="Voice",
                choices=["neutral", "bright", "deep"],
                value="neutral",
            )
            # Playback-rate multiplier (1.0 = nominal symbol durations).
            speed = gr.Slider(
                label="Speed",
                minimum=0.6,
                maximum=1.6,
                value=1.0,
                step=0.1,
            )
            # Fractional pitch offset applied on top of the voice preset.
            pitch = gr.Slider(
                label="Pitch shift",
                minimum=-0.3,
                maximum=0.3,
                value=0.0,
                step=0.05,
            )
            speak_button = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # type="numpy" matches the (sample_rate, ndarray) tuple the
            # service returns as its first output.
            audio = gr.Audio(label="Audio", type="numpy")
            status = gr.Textbox(label="Status", value=service.describe())
            normalized = gr.Textbox(label="Normalized Text", lines=8)

    # Wire the button to the callback; outputs align 1:1 with run_tts's
    # returned tuple.
    speak_button.click(
        fn=run_tts,
        inputs=[text, voice, speed, pitch],
        outputs=[audio, status, normalized],
    )


if __name__ == "__main__":
    demo.launch()
mini_tts/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .service import LocalTTSService
2
+
3
+ __all__ = ["LocalTTSService"]
mini_tts/config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass


@dataclass
class TTSConfig:
    """Tunable parameters for the tiny code-only waveform synthesizer."""

    sample_rate: int = 22050        # output sample rate in Hz
    base_pitch_hz: float = 140.0    # fundamental pitch before per-voice scaling
    symbol_duration_ms: int = 110   # nominal length of one rendered symbol
    pause_duration_ms: int = 90     # silence inserted for a word gap (spaces)
    crossfade_ms: int = 12          # linear overlap between adjacent segments
    amplitude: float = 0.75         # peak level after final normalization
mini_tts/normalizer.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re


# Two-letter sequences collapsed into single phoneme-style symbols.
# Uppercase outputs keep them distinct from plain letters downstream.
DIGRAPH_MAP = {
    "th": "T",
    "sh": "S",
    "ch": "C",
    "ph": "F",
    "oo": "U",
    "ee": "I",
    "ai": "A",
    "ou": "W",
}

# Compiled once at import time; reused by every normalize_text() call.
_DISALLOWED = re.compile(r"[^a-z0-9\s,.;:!?'-]")
_MULTI_SPACE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Lowercase and strip *text*, drop unsupported characters, collapse runs
    of whitespace to single spaces."""
    lowered = text.lower().strip()
    cleaned = _DISALLOWED.sub(" ", lowered)
    return _MULTI_SPACE.sub(" ", cleaned)


def text_to_symbols(text: str) -> list[str]:
    """Turn *text* into a flat list of phoneme-style symbols.

    Digraphs map to their single-symbol codes, sentence punctuation becomes
    the pause marker ``"|"``, spaces are kept as word gaps, and everything
    else passes through one character at a time.
    """
    normalized = normalize_text(text)
    out: list[str] = []
    pos = 0
    end = len(normalized)
    while pos < end:
        digraph = DIGRAPH_MAP.get(normalized[pos : pos + 2])
        if digraph is not None:
            out.append(digraph)
            pos += 2
            continue

        ch = normalized[pos]
        if ch in ",.;:!?":
            out.append("|")
        elif ch == " ":
            out.append(" ")
        else:
            out.append(ch)
        pos += 1
    return out
mini_tts/service.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .synth import TinyTTSSynthesizer
2
+
3
+
4
class LocalTTSService:
    """Thin facade over the code-only synthesizer, shaped for the Gradio app."""

    def __init__(self):
        # One engine instance is created up front and reused for all requests.
        self.engine = TinyTTSSynthesizer()

    def describe(self) -> str:
        """Return the short status line shown in the UI at startup."""
        return "Local TTS engine ready. No API key and no external model."

    def synthesize(
        self,
        text: str,
        voice: str,
        speed: float,
        pitch_shift: float,
    ):
        """Synthesize *text* and package the result for the Gradio outputs.

        Returns ``((sample_rate, waveform), status_message, normalized_text)``.
        """
        speed_f = float(speed)
        shift_f = float(pitch_shift)
        rate, wave, normalized = self.engine.synthesize(
            text=text,
            voice=voice,
            speed=speed_f,
            pitch_shift=shift_f,
        )
        status = (
            f"Generated local speech with voice={voice}, "
            f"speed={speed_f:.2f}, pitch_shift={shift_f:.2f}"
        )
        return (rate, wave), status, normalized
mini_tts/synth.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import math
3
+
4
+ import numpy as np
5
+
6
+ from .config import TTSConfig
7
+ from .normalizer import normalize_text, text_to_symbols
8
+
9
+
10
@dataclass(frozen=True)
class VoiceProfile:
    """Immutable per-voice tuning parameters."""

    # Multiplier applied to the engine's base pitch.
    pitch_scale: float
    # Multiplier applied to formant/resonance frequencies.
    formant_scale: float
    # Emphasis factor for fricative-noise brightness.
    brightness: float


# Named presets selectable from the UI; the synthesizer falls back to
# "neutral" for unknown names.
VOICE_PROFILES = {
    "neutral": VoiceProfile(pitch_scale=1.0, formant_scale=1.0, brightness=1.0),
    "bright": VoiceProfile(pitch_scale=1.2, formant_scale=1.1, brightness=1.15),
    "deep": VoiceProfile(pitch_scale=0.82, formant_scale=0.9, brightness=0.85),
}


# (F1, F2, F3) formant frequencies in Hz per vowel symbol. Uppercase keys
# are digraph codes from the normalizer ("ai"->A, "ee"->I, "oo"->U, "ou"->W).
VOWELS = {
    "a": (800, 1200, 2500),
    "e": (530, 1850, 2500),
    "i": (300, 2200, 2900),
    "o": (500, 900, 2400),
    "u": (350, 800, 2200),
    "A": (650, 1600, 2550),
    "I": (320, 2400, 3000),
    "U": (380, 1000, 2300),
    "W": (450, 1100, 2350),
}

# Consonant classes; each class is rendered by its own sound rule.
# Uppercase members are digraph codes ("sh"->S, "ph"->F, "th"->T, "ch"->C).
FRICATIVES = {"f", "s", "z", "h", "v", "j", "x", "S", "F", "T"}
STOPS = {"p", "b", "t", "d", "k", "g", "c", "q", "C"}
NASALS = {"m", "n"}
LIQUIDS = {"l", "r", "w", "y"}
40
+
41
+
42
class TinyTTSSynthesizer:
    """Code-only waveform synthesizer.

    Each symbol produced by ``text_to_symbols`` is rendered by a handcrafted
    sound rule (vowel formant stacks, noise fricatives, stop bursts, nasal
    hums, liquid glides, silence for spaces/punctuation). Segments are joined
    with short linear crossfades and the result is peak-normalized to
    ``config.amplitude``.
    """

    # Spoken English names for digit symbols. Class-level constant so the
    # table is built once, not rebuilt on every _digit() call.
    _DIGIT_NAMES = {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
    }

    def __init__(self, config: TTSConfig | None = None):
        self.config = config or TTSConfig()
        # Private generator so noise synthesis does not consume or disturb
        # NumPy's global random state shared with the host application.
        self._rng = np.random.default_rng()

    def synthesize(
        self,
        text: str,
        voice: str = "neutral",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
    ) -> tuple[int, np.ndarray, str]:
        """Convert *text* to audio.

        Returns ``(sample_rate, waveform, normalized_text)`` where the
        waveform is a mono float32 array peak-normalized to
        ``config.amplitude``. Unknown *voice* names fall back to "neutral";
        *speed* is clamped to at least 0.1 to avoid duration blow-ups.
        """
        normalized = normalize_text(text)
        symbols = text_to_symbols(text)
        profile = VOICE_PROFILES.get(voice, VOICE_PROFILES["neutral"])

        pieces: list[np.ndarray] = []
        for symbol in symbols:
            segment = self._render_symbol(
                symbol=symbol,
                profile=profile,
                speed=max(speed, 0.1),
                pitch_shift=pitch_shift,
            )
            if segment.size:
                pieces.append(segment)

        if not pieces:
            # Empty or fully-filtered input: emit a short silence rather than
            # a zero-length array.
            pieces.append(self._silence(0.25))

        audio = pieces[0]
        for piece in pieces[1:]:
            audio = self._crossfade(audio, piece)

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = (audio / peak) * self.config.amplitude

        return self.config.sample_rate, audio.astype(np.float32), normalized

    def _render_symbol(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Dispatch one symbol to its sound rule and return the audio segment."""
        if symbol == " ":
            return self._silence(self.config.pause_duration_ms / 1000 / speed)
        if symbol == "|":
            # Punctuation pause: a bit over twice a word gap.
            return self._silence((self.config.pause_duration_ms * 2.2) / 1000 / speed)
        if symbol in VOWELS:
            return self._vowel(symbol, profile, speed, pitch_shift)
        if symbol in FRICATIVES:
            return self._fricative(profile, speed)
        if symbol in STOPS:
            return self._stop(profile, speed)
        if symbol in NASALS:
            return self._nasal(profile, speed, pitch_shift)
        if symbol in LIQUIDS:
            return self._liquid(profile, speed, pitch_shift)
        if symbol.isdigit():
            return self._digit(symbol, profile, speed, pitch_shift)
        # Anything unclassified (apostrophes, hyphens, leftover letters)
        # becomes quiet shaped noise so it stays audible but unobtrusive.
        return self._soft_noise(speed)

    def _vowel(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Voiced vowel: harmonic-rich source plus three formant sinusoids."""
        duration = self._duration(1.0, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * profile.pitch_scale * (1.0 + pitch_shift)
        formants = [f * profile.formant_scale for f in VOWELS[symbol]]
        # Glottal-like source: fundamental plus decaying 2nd/3rd harmonics.
        source = (
            np.sin(2 * math.pi * pitch * t)
            + 0.35 * np.sin(2 * math.pi * pitch * 2.0 * t)
            + 0.18 * np.sin(2 * math.pi * pitch * 3.0 * t)
        )
        resonance = (
            0.42 * np.sin(2 * math.pi * formants[0] * t)
            + 0.22 * np.sin(2 * math.pi * formants[1] * t)
            + 0.12 * np.sin(2 * math.pi * formants[2] * t)
        )
        envelope = self._adsr(len(t), attack=0.08, decay=0.12, sustain=0.82, release=0.18)
        return (0.7 * source + 0.5 * resonance) * envelope

    def _fricative(self, profile: VoiceProfile, speed: float) -> np.ndarray:
        """Unvoiced fricative: high-frequency-tilted noise burst."""
        duration = self._duration(0.8, speed)
        n = self._num_samples(duration)
        noise = self._rng.uniform(-1.0, 1.0, n)
        # First difference acts as a crude high-pass ("spectral tilt").
        tilt = np.concatenate(([noise[0]], np.diff(noise)))
        mix = 0.65 * tilt + 0.35 * noise * profile.brightness
        envelope = self._adsr(n, attack=0.02, decay=0.05, sustain=0.6, release=0.2)
        return mix * envelope * 0.7

    def _stop(self, profile: VoiceProfile, speed: float) -> np.ndarray:
        """Stop consonant: short closure silence followed by a brief burst."""
        closure = self._silence(0.035 / speed)
        burst = self._fricative(profile, speed)[: self._num_samples(0.04 / speed)]
        return np.concatenate([closure, burst])

    def _nasal(
        self,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Nasal: slightly lowered pitch with fixed low resonances."""
        duration = self._duration(0.9, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * 0.92 * profile.pitch_scale * (1.0 + pitch_shift)
        signal = (
            np.sin(2 * math.pi * pitch * t)
            + 0.28 * np.sin(2 * math.pi * 280 * profile.formant_scale * t)
            + 0.12 * np.sin(2 * math.pi * 900 * profile.formant_scale * t)
        )
        envelope = self._adsr(len(t), attack=0.05, decay=0.08, sustain=0.72, release=0.2)
        return signal * envelope * 0.7

    def _liquid(
        self,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Liquid/glide: raised pitch with a small linear frequency glide."""
        duration = self._duration(0.75, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * 1.05 * profile.pitch_scale * (1.0 + pitch_shift)
        glide = np.linspace(0.95, 1.05, len(t))
        signal = (
            np.sin(2 * math.pi * pitch * glide * t)
            + 0.22 * np.sin(2 * math.pi * 700 * profile.formant_scale * t)
            + 0.1 * np.sin(2 * math.pi * 1500 * profile.formant_scale * t)
        )
        envelope = self._adsr(len(t), attack=0.04, decay=0.08, sustain=0.7, release=0.18)
        return signal * envelope * 0.65

    def _digit(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Digit: spell out its English name and render each resulting symbol."""
        chunks = [
            self._render_symbol(s, profile, speed, pitch_shift)
            for s in text_to_symbols(self._DIGIT_NAMES[symbol])
        ]
        result = chunks[0] if chunks else self._silence(0.08)
        for chunk in chunks[1:]:
            result = self._crossfade(result, chunk)
        return result

    def _soft_noise(self, speed: float) -> np.ndarray:
        """Fallback for unclassified symbols: quiet, quickly-decaying noise."""
        duration = self._duration(0.45, speed)
        n = self._num_samples(duration)
        noise = self._rng.uniform(-0.3, 0.3, n)
        envelope = self._adsr(n, attack=0.03, decay=0.1, sustain=0.2, release=0.12)
        return noise * envelope

    def _crossfade(self, left: np.ndarray, right: np.ndarray) -> np.ndarray:
        """Join two segments with a linear crossfade of ``config.crossfade_ms``.

        Falls back to plain concatenation when either segment is shorter than
        the fade window would allow.
        """
        fade = min(
            int(self.config.sample_rate * self.config.crossfade_ms / 1000),
            len(left),
            len(right),
        )
        if fade <= 0:
            return np.concatenate([left, right])

        curve_out = np.linspace(1.0, 0.0, fade)
        curve_in = np.linspace(0.0, 1.0, fade)
        mixed = left[-fade:] * curve_out + right[:fade] * curve_in
        return np.concatenate([left[:-fade], mixed, right[fade:]])

    def _duration(self, scale: float, speed: float) -> float:
        """Symbol duration in seconds, floored at 30 ms."""
        base = self.config.symbol_duration_ms / 1000
        return max(0.03, (base * scale) / speed)

    def _num_samples(self, duration: float) -> int:
        """Sample count for *duration* seconds, at least 1."""
        return max(1, int(self.config.sample_rate * duration))

    def _timeline(self, duration: float) -> np.ndarray:
        """Time axis in seconds for *duration*, endpoint excluded."""
        return np.linspace(0.0, duration, self._num_samples(duration), endpoint=False)

    def _silence(self, duration: float) -> np.ndarray:
        """Zero-filled float32 segment of *duration* seconds."""
        return np.zeros(self._num_samples(duration), dtype=np.float32)

    def _adsr(
        self,
        n: int,
        attack: float,
        decay: float,
        sustain: float,
        release: float,
    ) -> np.ndarray:
        """Linear ADSR envelope of exactly *n* samples.

        *attack*, *decay* and *release* are fractions of *n*; *sustain* is a
        level in [0, 1]. The sustain span absorbs rounding slack, and the
        result is padded/truncated so the length is exactly *n*.
        """
        attack_n = max(1, int(n * attack))
        decay_n = max(1, int(n * decay))
        release_n = max(1, int(n * release))
        sustain_n = max(1, n - attack_n - decay_n - release_n)

        attack_curve = np.linspace(0.0, 1.0, attack_n, endpoint=False)
        decay_curve = np.linspace(1.0, sustain, decay_n, endpoint=False)
        sustain_curve = np.full(sustain_n, sustain)
        release_curve = np.linspace(sustain, 0.0, release_n, endpoint=True)
        envelope = np.concatenate([attack_curve, decay_curve, sustain_curve, release_curve])
        if len(envelope) < n:
            envelope = np.pad(envelope, (0, n - len(envelope)))
        return envelope[:n]
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=5.23.0
2
+ numpy>=1.26.0