File size: 2,923 Bytes
a3419b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# app/engines/base.py
# Abstract base class for all TTS engines.
# To add a new engine: create a new file in engines/, subclass TTSEngine,
# implement synthesize(), and register it in engines/__init__.py.

from abc import ABC, abstractmethod


class TTSEngine(ABC):
    """
    Base class for all TTS engines in the Bantrly evaluation framework.
    Every engine must implement synthesize() with this exact signature.

    Subclasses set the class-level metadata below and may override
    BAND_CONFIG to tune voice/speed per grade band.
    """

    # --- class-level metadata (set in each subclass) ---

    name: str = ""
    # Display name shown in the UI dropdown

    engine_type: str = ""
    # One of: "rule-based-local" | "neural-local" | "neural-cloud-free" | "neural-cloud-paid"

    cost_per_million_chars: float = 0.0
    # Cost in USD per 1M characters. 0.0 for free/local engines.
    # Used to compute equivalent cost column in comparison table.

    is_production_ready: bool = False
    # If False, shown with a "baseline only" label in the UI.

    requires_internet: bool = False
    # If True, shown with a warning in the UI when offline.

    # --- grade-band config ---
    # Subclasses can override this to apply per-band voice/speed tuning.
    # Format: { "K-2": {...}, "3-5": {...}, "6-8": {...}, "9-12": {...} }
    # Empty by default: an engine without per-band tuning gets {} back
    # from get_band_config().
    BAND_CONFIG: dict = {}

    @abstractmethod
    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """
        Synthesize text to audio file.

        Args:
            text:        coaching text to synthesize
            band:        grade band — one of "K-2", "3-5", "6-8", "9-12"
            output_path: full path to save audio (without extension —
                         each engine appends its own extension)

        Returns:
            dict with keys:
                audio_path      (str)   full path to saved audio file
                latency_seconds (float) wall-clock synthesis time
                voice           (str)   voice ID used
                speed           (float) speed multiplier used (1.0 if N/A)
                engine          (str)   engine name (same as self.name)
        """
        ...

    def estimate_cost(self, text: str) -> float:
        """
        Estimate cost in USD for synthesizing this text.

        The estimate is len(text) scaled by cost_per_million_chars, so it
        is 0.0 for free/local engines (rate 0.0) and for empty text.
        """
        return (len(text) / 1_000_000) * self.cost_per_million_chars

    def get_band_config(self, band: str) -> dict:
        """
        Get config for the given band, falling back to the most neutral
        available band if the requested band is not found.

        Args:
            band: grade band key to look up in BAND_CONFIG

        Returns:
            The config dict stored for `band` if present; otherwise the
            config of the first available fallback band; otherwise the
            first config defined. Returns an empty dict when BAND_CONFIG
            defines no bands at all.
        """
        band_configs = self.BAND_CONFIG
        if not band_configs:
            # No per-band tuning defined (the base-class default). Without
            # this guard the last-resort next(iter(...)) below would leak a
            # bare StopIteration to the caller.
            return {}
        if band in band_configs:
            return band_configs[band]
        # fallback priority: 6-8 > 9-12 > 3-5 > K-2 > first available
        for fallback in ("6-8", "9-12", "3-5", "K-2"):
            if fallback in band_configs:
                return band_configs[fallback]
        # last resort: only non-standard band keys exist — return the first
        return next(iter(band_configs.values()))