abersbail committed on
Commit
0a88ee7
·
verified ·
1 Parent(s): 22d0cc3

Deploy tiny code-only TTS Space

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ artifacts/
README.md CHANGED
@@ -1,12 +1,61 @@
1
  ---
2
- title: Tiny Code Only Tts
3
- emoji: 🚀
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Tiny Code-Only TTS
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.23.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Tiny Code-Only TTS for Hugging Face Spaces
13
+
14
+ This project builds a simple text-to-speech system from code only.
15
+
16
+ - No API key
17
+ - No external model
18
+ - No pretrained checkpoint
19
+ - Pure Python waveform synthesis
20
+ - Gradio UI for Hugging Face Spaces
21
+
22
+ ## What it does
23
+
24
+ It converts text into robotic speech audio using a lightweight phoneme-style synthesizer. The engine uses handcrafted sound rules for vowels, fricatives, stops, nasals, liquids, and pauses.
25
+
26
+ This is a starter TTS project for deployment and experimentation. It is intentionally simple and CPU-friendly.
27
+
28
+ ## Project structure
29
+
30
+ ```text
31
+ .
32
+ ├── app.py
33
+ ├── requirements.txt
34
+ └── mini_tts/
35
+ ├── __init__.py
36
+ ├── config.py
37
+ ├── normalizer.py
38
+ ├── service.py
39
+ └── synth.py
40
+ ```
41
+
42
+ ## Run locally
43
+
44
+ ```bash
45
+ pip install -r requirements.txt
46
+ python app.py
47
+ ```
48
+
49
+ ## Deploy on Hugging Face Spaces
50
+
51
+ 1. Create a new Space.
52
+ 2. Choose `Gradio`.
53
+ 3. Upload these files.
54
+ 4. The Space will install the dependencies listed in `requirements.txt`.
55
+ 5. Open the app and generate speech directly from text.
56
+
57
+ ## Notes
58
+
59
+ - The voice is synthetic and simple by design.
60
+ - You can tune pitch, speed, and voice color in the UI.
61
+ - You can extend phoneme rules in `mini_tts/synth.py`.
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from mini_tts.service import LocalTTSService
4
+
5
+
6
# A single engine instance shared by every request; each synthesize() call
# is self-contained, so reuse across Gradio callbacks is fine.
service = LocalTTSService()


def run_tts(text: str, voice: str, speed: float, pitch: float):
    """Gradio callback: forward the UI control values to the local TTS service.

    Returns the service's ``((sample_rate, audio), status, normalized)`` tuple,
    matching the three output components of the interface.
    """
    return service.synthesize(text=text, voice=voice, speed=speed, pitch_shift=pitch)
16
+
17
+
18
# Build the Gradio UI: input controls on the left column, generated audio
# and diagnostic text on the right column.
with gr.Blocks(title="Tiny Code-Only TTS") as demo:
    gr.Markdown(
        """
        # Tiny Code-Only TTS
        A simple text-to-speech engine built from code only.

        - No API key
        - No hosted model
        - No pretrained checkpoint
        - Designed for Hugging Face Spaces
        """
    )

    with gr.Row():
        with gr.Column():
            # Text to synthesize.
            text = gr.Textbox(
                label="Text",
                value="Hello. This is a simple text to speech demo built only with code.",
                lines=8,
            )
            # Voice preset; choices mirror VOICE_PROFILES in mini_tts/synth.py.
            voice = gr.Dropdown(
                label="Voice",
                choices=["neutral", "bright", "deep"],
                value="neutral",
            )
            # Playback-rate multiplier (1.0 = nominal symbol durations).
            speed = gr.Slider(
                label="Speed",
                minimum=0.6,
                maximum=1.6,
                value=1.0,
                step=0.1,
            )
            # Fractional pitch offset applied on top of the voice preset.
            pitch = gr.Slider(
                label="Pitch shift",
                minimum=-0.3,
                maximum=0.3,
                value=0.0,
                step=0.05,
            )
            speak_button = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # type="numpy" matches the (sample_rate, ndarray) tuple the
            # service returns as its first output.
            audio = gr.Audio(label="Audio", type="numpy")
            status = gr.Textbox(label="Status", value=service.describe())
            normalized = gr.Textbox(label="Normalized Text", lines=8)

    # Wire the button to the callback; outputs align 1:1 with run_tts's
    # returned tuple.
    speak_button.click(
        fn=run_tts,
        inputs=[text, voice, speed, pitch],
        outputs=[audio, status, normalized],
    )


if __name__ == "__main__":
    demo.launch()
mini_tts/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .service import LocalTTSService
2
+
3
+ __all__ = ["LocalTTSService"]
mini_tts/config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass


@dataclass
class TTSConfig:
    """Tunable parameters for the tiny code-only waveform synthesizer."""

    sample_rate: int = 22050        # output sample rate in Hz
    base_pitch_hz: float = 140.0    # fundamental pitch before per-voice scaling
    symbol_duration_ms: int = 110   # nominal length of one rendered symbol
    pause_duration_ms: int = 90     # silence inserted for a word gap (spaces)
    crossfade_ms: int = 12          # linear overlap between adjacent segments
    amplitude: float = 0.75         # peak level after final normalization
mini_tts/normalizer.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re


# Two-letter sequences collapsed into single phoneme-style symbols.
# Uppercase outputs keep them distinct from plain letters downstream.
DIGRAPH_MAP = {
    "th": "T",
    "sh": "S",
    "ch": "C",
    "ph": "F",
    "oo": "U",
    "ee": "I",
    "ai": "A",
    "ou": "W",
}

# Compiled once at import time; reused by every normalize_text() call.
_DISALLOWED = re.compile(r"[^a-z0-9\s,.;:!?'-]")
_MULTI_SPACE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Lowercase and strip *text*, drop unsupported characters, collapse runs
    of whitespace to single spaces."""
    lowered = text.lower().strip()
    cleaned = _DISALLOWED.sub(" ", lowered)
    return _MULTI_SPACE.sub(" ", cleaned)


def text_to_symbols(text: str) -> list[str]:
    """Turn *text* into a flat list of phoneme-style symbols.

    Digraphs map to their single-symbol codes, sentence punctuation becomes
    the pause marker ``"|"``, spaces are kept as word gaps, and everything
    else passes through one character at a time.
    """
    normalized = normalize_text(text)
    out: list[str] = []
    pos = 0
    end = len(normalized)
    while pos < end:
        digraph = DIGRAPH_MAP.get(normalized[pos : pos + 2])
        if digraph is not None:
            out.append(digraph)
            pos += 2
            continue

        ch = normalized[pos]
        if ch in ",.;:!?":
            out.append("|")
        elif ch == " ":
            out.append(" ")
        else:
            out.append(ch)
        pos += 1
    return out
mini_tts/service.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .synth import TinyTTSSynthesizer
2
+
3
+
4
class LocalTTSService:
    """Thin facade over the code-only synthesizer, shaped for the Gradio app."""

    def __init__(self):
        # One engine instance is created up front and reused for all requests.
        self.engine = TinyTTSSynthesizer()

    def describe(self) -> str:
        """Return the short status line shown in the UI at startup."""
        return "Local TTS engine ready. No API key and no external model."

    def synthesize(
        self,
        text: str,
        voice: str,
        speed: float,
        pitch_shift: float,
    ):
        """Synthesize *text* and package the result for the Gradio outputs.

        Returns ``((sample_rate, waveform), status_message, normalized_text)``.
        """
        speed_f = float(speed)
        shift_f = float(pitch_shift)
        rate, wave, normalized = self.engine.synthesize(
            text=text,
            voice=voice,
            speed=speed_f,
            pitch_shift=shift_f,
        )
        status = (
            f"Generated local speech with voice={voice}, "
            f"speed={speed_f:.2f}, pitch_shift={shift_f:.2f}"
        )
        return (rate, wave), status, normalized
mini_tts/synth.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import math
3
+
4
+ import numpy as np
5
+
6
+ from .config import TTSConfig
7
+ from .normalizer import normalize_text, text_to_symbols
8
+
9
+
10
@dataclass(frozen=True)
class VoiceProfile:
    """Immutable per-voice tuning parameters."""

    # Multiplier applied to the engine's base pitch.
    pitch_scale: float
    # Multiplier applied to formant/resonance frequencies.
    formant_scale: float
    # Emphasis factor for fricative-noise brightness.
    brightness: float


# Named presets selectable from the UI; the synthesizer falls back to
# "neutral" for unknown names.
VOICE_PROFILES = {
    "neutral": VoiceProfile(pitch_scale=1.0, formant_scale=1.0, brightness=1.0),
    "bright": VoiceProfile(pitch_scale=1.2, formant_scale=1.1, brightness=1.15),
    "deep": VoiceProfile(pitch_scale=0.82, formant_scale=0.9, brightness=0.85),
}


# (F1, F2, F3) formant frequencies in Hz per vowel symbol. Uppercase keys
# are digraph codes from the normalizer ("ai"->A, "ee"->I, "oo"->U, "ou"->W).
VOWELS = {
    "a": (800, 1200, 2500),
    "e": (530, 1850, 2500),
    "i": (300, 2200, 2900),
    "o": (500, 900, 2400),
    "u": (350, 800, 2200),
    "A": (650, 1600, 2550),
    "I": (320, 2400, 3000),
    "U": (380, 1000, 2300),
    "W": (450, 1100, 2350),
}

# Consonant classes; each class is rendered by its own sound rule.
# Uppercase members are digraph codes ("sh"->S, "ph"->F, "th"->T, "ch"->C).
FRICATIVES = {"f", "s", "z", "h", "v", "j", "x", "S", "F", "T"}
STOPS = {"p", "b", "t", "d", "k", "g", "c", "q", "C"}
NASALS = {"m", "n"}
LIQUIDS = {"l", "r", "w", "y"}
40
+
41
+
42
class TinyTTSSynthesizer:
    """Code-only waveform synthesizer.

    Each symbol produced by ``text_to_symbols`` is rendered by a handcrafted
    sound rule (vowel formant stacks, noise fricatives, stop bursts, nasal
    hums, liquid glides, silence for spaces/punctuation). Segments are joined
    with short linear crossfades and the result is peak-normalized to
    ``config.amplitude``.
    """

    # Spoken English names for digit symbols. Class-level constant so the
    # table is built once, not rebuilt on every _digit() call.
    _DIGIT_NAMES = {
        "0": "zero",
        "1": "one",
        "2": "two",
        "3": "three",
        "4": "four",
        "5": "five",
        "6": "six",
        "7": "seven",
        "8": "eight",
        "9": "nine",
    }

    def __init__(self, config: TTSConfig | None = None):
        self.config = config or TTSConfig()
        # Private generator so noise synthesis does not consume or disturb
        # NumPy's global random state shared with the host application.
        self._rng = np.random.default_rng()

    def synthesize(
        self,
        text: str,
        voice: str = "neutral",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
    ) -> tuple[int, np.ndarray, str]:
        """Convert *text* to audio.

        Returns ``(sample_rate, waveform, normalized_text)`` where the
        waveform is a mono float32 array peak-normalized to
        ``config.amplitude``. Unknown *voice* names fall back to "neutral";
        *speed* is clamped to at least 0.1 to avoid duration blow-ups.
        """
        normalized = normalize_text(text)
        symbols = text_to_symbols(text)
        profile = VOICE_PROFILES.get(voice, VOICE_PROFILES["neutral"])

        pieces: list[np.ndarray] = []
        for symbol in symbols:
            segment = self._render_symbol(
                symbol=symbol,
                profile=profile,
                speed=max(speed, 0.1),
                pitch_shift=pitch_shift,
            )
            if segment.size:
                pieces.append(segment)

        if not pieces:
            # Empty or fully-filtered input: emit a short silence rather than
            # a zero-length array.
            pieces.append(self._silence(0.25))

        audio = pieces[0]
        for piece in pieces[1:]:
            audio = self._crossfade(audio, piece)

        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = (audio / peak) * self.config.amplitude

        return self.config.sample_rate, audio.astype(np.float32), normalized

    def _render_symbol(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Dispatch one symbol to its sound rule and return the audio segment."""
        if symbol == " ":
            return self._silence(self.config.pause_duration_ms / 1000 / speed)
        if symbol == "|":
            # Punctuation pause: a bit over twice a word gap.
            return self._silence((self.config.pause_duration_ms * 2.2) / 1000 / speed)
        if symbol in VOWELS:
            return self._vowel(symbol, profile, speed, pitch_shift)
        if symbol in FRICATIVES:
            return self._fricative(profile, speed)
        if symbol in STOPS:
            return self._stop(profile, speed)
        if symbol in NASALS:
            return self._nasal(profile, speed, pitch_shift)
        if symbol in LIQUIDS:
            return self._liquid(profile, speed, pitch_shift)
        if symbol.isdigit():
            return self._digit(symbol, profile, speed, pitch_shift)
        # Anything unclassified (apostrophes, hyphens, leftover letters)
        # becomes quiet shaped noise so it stays audible but unobtrusive.
        return self._soft_noise(speed)

    def _vowel(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Voiced vowel: harmonic-rich source plus three formant sinusoids."""
        duration = self._duration(1.0, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * profile.pitch_scale * (1.0 + pitch_shift)
        formants = [f * profile.formant_scale for f in VOWELS[symbol]]
        # Glottal-like source: fundamental plus decaying 2nd/3rd harmonics.
        source = (
            np.sin(2 * math.pi * pitch * t)
            + 0.35 * np.sin(2 * math.pi * pitch * 2.0 * t)
            + 0.18 * np.sin(2 * math.pi * pitch * 3.0 * t)
        )
        resonance = (
            0.42 * np.sin(2 * math.pi * formants[0] * t)
            + 0.22 * np.sin(2 * math.pi * formants[1] * t)
            + 0.12 * np.sin(2 * math.pi * formants[2] * t)
        )
        envelope = self._adsr(len(t), attack=0.08, decay=0.12, sustain=0.82, release=0.18)
        return (0.7 * source + 0.5 * resonance) * envelope

    def _fricative(self, profile: VoiceProfile, speed: float) -> np.ndarray:
        """Unvoiced fricative: high-frequency-tilted noise burst."""
        duration = self._duration(0.8, speed)
        n = self._num_samples(duration)
        noise = self._rng.uniform(-1.0, 1.0, n)
        # First difference acts as a crude high-pass ("spectral tilt").
        tilt = np.concatenate(([noise[0]], np.diff(noise)))
        mix = 0.65 * tilt + 0.35 * noise * profile.brightness
        envelope = self._adsr(n, attack=0.02, decay=0.05, sustain=0.6, release=0.2)
        return mix * envelope * 0.7

    def _stop(self, profile: VoiceProfile, speed: float) -> np.ndarray:
        """Stop consonant: short closure silence followed by a brief burst."""
        closure = self._silence(0.035 / speed)
        burst = self._fricative(profile, speed)[: self._num_samples(0.04 / speed)]
        return np.concatenate([closure, burst])

    def _nasal(
        self,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Nasal: slightly lowered pitch with fixed low resonances."""
        duration = self._duration(0.9, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * 0.92 * profile.pitch_scale * (1.0 + pitch_shift)
        signal = (
            np.sin(2 * math.pi * pitch * t)
            + 0.28 * np.sin(2 * math.pi * 280 * profile.formant_scale * t)
            + 0.12 * np.sin(2 * math.pi * 900 * profile.formant_scale * t)
        )
        envelope = self._adsr(len(t), attack=0.05, decay=0.08, sustain=0.72, release=0.2)
        return signal * envelope * 0.7

    def _liquid(
        self,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Liquid/glide: raised pitch with a small linear frequency glide."""
        duration = self._duration(0.75, speed)
        t = self._timeline(duration)
        pitch = self.config.base_pitch_hz * 1.05 * profile.pitch_scale * (1.0 + pitch_shift)
        glide = np.linspace(0.95, 1.05, len(t))
        signal = (
            np.sin(2 * math.pi * pitch * glide * t)
            + 0.22 * np.sin(2 * math.pi * 700 * profile.formant_scale * t)
            + 0.1 * np.sin(2 * math.pi * 1500 * profile.formant_scale * t)
        )
        envelope = self._adsr(len(t), attack=0.04, decay=0.08, sustain=0.7, release=0.18)
        return signal * envelope * 0.65

    def _digit(
        self,
        symbol: str,
        profile: VoiceProfile,
        speed: float,
        pitch_shift: float,
    ) -> np.ndarray:
        """Digit: spell out its English name and render each resulting symbol."""
        chunks = [
            self._render_symbol(s, profile, speed, pitch_shift)
            for s in text_to_symbols(self._DIGIT_NAMES[symbol])
        ]
        result = chunks[0] if chunks else self._silence(0.08)
        for chunk in chunks[1:]:
            result = self._crossfade(result, chunk)
        return result

    def _soft_noise(self, speed: float) -> np.ndarray:
        """Fallback for unclassified symbols: quiet, quickly-decaying noise."""
        duration = self._duration(0.45, speed)
        n = self._num_samples(duration)
        noise = self._rng.uniform(-0.3, 0.3, n)
        envelope = self._adsr(n, attack=0.03, decay=0.1, sustain=0.2, release=0.12)
        return noise * envelope

    def _crossfade(self, left: np.ndarray, right: np.ndarray) -> np.ndarray:
        """Join two segments with a linear crossfade of ``config.crossfade_ms``.

        Falls back to plain concatenation when either segment is shorter than
        the fade window would allow.
        """
        fade = min(
            int(self.config.sample_rate * self.config.crossfade_ms / 1000),
            len(left),
            len(right),
        )
        if fade <= 0:
            return np.concatenate([left, right])

        curve_out = np.linspace(1.0, 0.0, fade)
        curve_in = np.linspace(0.0, 1.0, fade)
        mixed = left[-fade:] * curve_out + right[:fade] * curve_in
        return np.concatenate([left[:-fade], mixed, right[fade:]])

    def _duration(self, scale: float, speed: float) -> float:
        """Symbol duration in seconds, floored at 30 ms."""
        base = self.config.symbol_duration_ms / 1000
        return max(0.03, (base * scale) / speed)

    def _num_samples(self, duration: float) -> int:
        """Sample count for *duration* seconds, at least 1."""
        return max(1, int(self.config.sample_rate * duration))

    def _timeline(self, duration: float) -> np.ndarray:
        """Time axis in seconds for *duration*, endpoint excluded."""
        return np.linspace(0.0, duration, self._num_samples(duration), endpoint=False)

    def _silence(self, duration: float) -> np.ndarray:
        """Zero-filled float32 segment of *duration* seconds."""
        return np.zeros(self._num_samples(duration), dtype=np.float32)

    def _adsr(
        self,
        n: int,
        attack: float,
        decay: float,
        sustain: float,
        release: float,
    ) -> np.ndarray:
        """Linear ADSR envelope of exactly *n* samples.

        *attack*, *decay* and *release* are fractions of *n*; *sustain* is a
        level in [0, 1]. The sustain span absorbs rounding slack, and the
        result is padded/truncated so the length is exactly *n*.
        """
        attack_n = max(1, int(n * attack))
        decay_n = max(1, int(n * decay))
        release_n = max(1, int(n * release))
        sustain_n = max(1, n - attack_n - decay_n - release_n)

        attack_curve = np.linspace(0.0, 1.0, attack_n, endpoint=False)
        decay_curve = np.linspace(1.0, sustain, decay_n, endpoint=False)
        sustain_curve = np.full(sustain_n, sustain)
        release_curve = np.linspace(sustain, 0.0, release_n, endpoint=True)
        envelope = np.concatenate([attack_curve, decay_curve, sustain_curve, release_curve])
        if len(envelope) < n:
            envelope = np.pad(envelope, (0, n - len(envelope)))
        return envelope[:n]
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=5.23.0
2
+ numpy>=1.26.0