# Provenance (residue of the web file viewer this source was copied from):
# file size 10,634 bytes; hash-like ids 4fa8fb6 and a0a4728 (presumably commit ids).
import gradio as gr
import requests
import os
import time
import json
from dotenv import load_dotenv

# --- Configuration & Constants ---
# Run python-dotenv's loader first so that values from a local .env file (if
# one exists) are visible to the os.getenv() lookups further down.
load_dotenv()

def parse_api_tokens(raw_tokens):
    """Split a comma-separated token string into a list of usable tokens.

    Whitespace around each entry is stripped and blank entries (for example
    from a trailing comma or ``"a,,b"``) are dropped, so every returned item
    is a non-empty string.

    Args:
        raw_tokens: The raw comma-separated string, or ``None``.

    Returns:
        A list of non-empty token strings; ``[]`` when ``raw_tokens`` is
        ``None``, empty, or contains only blanks and commas.
    """
    if not raw_tokens:
        return []
    stripped = (piece.strip() for piece in raw_tokens.split(","))
    return [token for token in stripped if token]


# Comma-separated list of Replicate API tokens, read from the environment.
REPLICATE_API_TOKENS_STR = os.getenv("REPLICATE_API_TOKENS")
REPLICATE_API_KEYS = parse_api_tokens(REPLICATE_API_TOKENS_STR)
if not REPLICATE_API_TOKENS_STR:
    print("WARNING: REPLICATE_API_TOKENS not found. App will not function.")
elif not REPLICATE_API_KEYS:
    # The variable is set but held only commas/whitespace.
    print("WARNING: REPLICATE_API_TOKENS contains no usable tokens. App will not function.")

# URL that prediction requests for the minimax/speech-02-hd model are POSTed to.
MODEL_ENDPOINT = "https://api.replicate.com/v1/models/minimax/speech-02-hd/predictions"

def load_voice_map(path="voices.json"):
    """Read the display-name -> voice-id mapping from a JSON file.

    Every failure mode is reported on stdout and degrades to an empty
    mapping instead of raising, so the module can still be imported.

    Args:
        path: Location of the JSON file; defaults to ``voices.json`` in the
            current working directory.

    Returns:
        The decoded JSON object as a ``dict``.  ``{}`` when the file is
        missing, is not valid JSON / not UTF-8, or decodes to something
        other than a JSON object (list, string, number, ``null``).
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        print("ERROR: voices.json not found. Please create it.")
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError):
        print("ERROR: voices.json is not valid JSON.")
        return {}

    # Valid JSON is not necessarily an object; anything else has no .keys()
    # and cannot be used for name -> id lookups.
    if not isinstance(loaded, dict):
        print("ERROR: voices.json must contain a JSON object mapping voice names to voice IDs.")
        return {}
    if not loaded:
        print("WARNING: voices.json is empty or could not be loaded.")
    return loaded


# Display name -> voice id, as loaded from voices.json.
VOICE_ID_MAP = load_voice_map()
# Display names in file order.
VOICE_ID_PRETTY_NAMES = list(VOICE_ID_MAP)
# Prefer "Friendly Person" as the default; otherwise the first entry, or None
# when no voices are available.
if "Friendly Person" in VOICE_ID_MAP:
    DEFAULT_VOICE_PRETTY_NAME = "Friendly Person"
elif VOICE_ID_PRETTY_NAMES:
    DEFAULT_VOICE_PRETTY_NAME = VOICE_ID_PRETTY_NAMES[0]
else:
    DEFAULT_VOICE_PRETTY_NAME = None

# Choice lists offered by the dropdowns in the UI.
EMOTIONS = [
    "auto",
    "neutral",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgusted",
    "surprised",
]
SAMPLE_RATES = [
    8000,
    16000,
    22050,
    24000,
    32000,
    44100,
]
BITRATES = [
    32000,
    64000,
    128000,
    256000,
]
CHANNELS = [
    "mono",
    "stereo",
]
LANGUAGE_BOOST_OPTIONS = [
    "None",
    "English",
    "Chinese",
    "Japanese",
    "Korean",
]

# Cursor into REPLICATE_API_KEYS, advanced by get_next_api_key().
current_key_index = 0
# A prediction is polled at most MAX_POLLING_ATTEMPTS times, sleeping
# POLL_INTERVAL seconds before each poll.
MAX_POLLING_ATTEMPTS = 60
POLL_INTERVAL = 3

def get_next_api_key():
    """Hand out the configured API keys in round-robin order.

    Returns the key at the module-level cursor ``current_key_index`` and then
    advances the cursor, wrapping to the start after the last key.  Returns
    ``None`` when ``REPLICATE_API_KEYS`` is empty.
    """
    global current_key_index
    if REPLICATE_API_KEYS:
        position = current_key_index
        selected = REPLICATE_API_KEYS[position]
        current_key_index = (position + 1) % len(REPLICATE_API_KEYS)
        return selected
    return None

def _run_prediction(api_key, payload):
    """Create one prediction with ``api_key`` and poll it until it settles.

    Args:
        api_key: The API token used for both the create and the poll calls.
        payload: JSON body POSTed to ``MODEL_ENDPOINT``.

    Returns:
        ``(audio_url, None)`` when the prediction succeeded and produced an
        output, otherwise ``(None, error_message)`` describing why this
        attempt failed (failed status, polling timeout, or missing output).

    Raises:
        requests.exceptions.RequestException: On transport errors or a
            non-2xx response from either call.
        ValueError: If a response body cannot be decoded as JSON.
    """
    key_tail = api_key[-4:]
    # The same headers serve the creating POST and the polling GETs.
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    response = requests.post(MODEL_ENDPOINT, json=payload, headers=headers, timeout=30)
    response.raise_for_status()
    result = response.json()
    current_status = result.get("status")
    print(f"Initial API Response (Key ...{key_tail}): Status '{current_status}'")

    # `or {}` also covers an explicit `"urls": null` in the response.
    prediction_url = (result.get("urls") or {}).get("get")
    logs_from_initial_call = result.get("logs")

    polling_attempts = 0
    while current_status in ("starting", "processing") and prediction_url:
        if polling_attempts >= MAX_POLLING_ATTEMPTS:
            print(f"Polling timed out for key ...{key_tail}.")
            result["error"] = "Polling timed out."  # Surfaces in the failure message below
            current_status = "failed_polling_timeout"
            break
        polling_attempts += 1
        time.sleep(POLL_INTERVAL)
        poll_response = requests.get(prediction_url, headers=headers, timeout=30)
        poll_response.raise_for_status()
        result = poll_response.json()
        current_status = result.get("status")

    # Anything but "succeeded" ("failed", the local timeout marker, or an
    # unexpected state) counts as a failed attempt for this key.
    if current_status != "succeeded":
        error_detail = result.get("error", f"Unknown error or unexpected status '{current_status}'")
        return None, (
            f"Prediction failed/timed out for key ...{key_tail}. "
            f"Status: {current_status}. Error: {error_detail}"
        )

    audio_url = result.get("output")
    if not audio_url:
        return None, f"API succeeded (Key ...{key_tail}) but no output URL. Resp: {result}"

    success_logs = result.get('logs', logs_from_initial_call if logs_from_initial_call else 'N/A')
    print(f"Success with key ...{key_tail}. Logs: {success_logs}")
    return audio_url, None


def generate_speech(
    text, pitch, speed, volume, bitrate, channel, emotion,
    voice_id_pretty_name, custom_voice_id, sample_rate,
    language_boost, english_normalization
):
    """Synthesize ``text`` through the Replicate endpoint and return the audio URL.

    Each configured API key is tried at most once, in round-robin order, until
    one attempt succeeds.

    Args:
        text: Text to synthesize; blank input is rejected with a warning.
        pitch: Pitch value, sent as ``int``.
        speed: Speed value, sent as ``float``.
        volume: Volume value, sent as ``int``.
        bitrate: Bitrate, sent as ``int``.
        channel: Channel setting, sent unchanged.
        emotion: Emotion setting, sent unchanged.
        voice_id_pretty_name: Display name looked up in ``VOICE_ID_MAP``.
        custom_voice_id: Explicit voice id; when non-blank it overrides the
            display-name lookup.
        sample_rate: Sample rate, sent as ``int``.
        language_boost: Sent as ``language_boost`` unless empty or "none"
            (case-insensitive).
        english_normalization: Sent as ``bool``.

    Returns:
        The ``output`` value of the successful prediction, or ``None`` when
        ``text`` is blank.

    Raises:
        gr.Error: When no API keys are configured, no voice id can be
            resolved, or every key's attempt failed.  Raising (rather than
            merely constructing) the error is what makes Gradio show it.
    """
    # Tolerate None from the UI for the two free-text inputs.
    text = text or ""
    custom_voice_id = (custom_voice_id or "").strip()

    if not text.strip():
        gr.Warning("Text input cannot be empty.")
        return None  # Must return a value for the audio output

    if not REPLICATE_API_KEYS:
        raise gr.Error("No Replicate API Tokens configured. Please set REPLICATE_API_TOKENS in secrets.")

    if not VOICE_ID_MAP and not custom_voice_id:
        raise gr.Error("Voice ID configuration is missing (voices.json empty/invalid) and no custom voice ID provided.")

    # A custom id always wins over the dropdown selection.
    if custom_voice_id:
        actual_voice_id_to_use = custom_voice_id
    elif voice_id_pretty_name and voice_id_pretty_name in VOICE_ID_MAP:
        actual_voice_id_to_use = VOICE_ID_MAP[voice_id_pretty_name]
    else:
        raise gr.Error(f"Selected voice '{voice_id_pretty_name}' not found in mappings and no custom ID provided.")

    payload = {
        "input": {
            "text": text, "pitch": int(pitch), "speed": float(speed), "volume": int(volume),
            "bitrate": int(bitrate), "channel": channel, "emotion": emotion,
            "voice_id": actual_voice_id_to_use, "sample_rate": int(sample_rate),
            "english_normalization": bool(english_normalization)
        }
    }
    if language_boost and language_boost.lower() != "none":
        payload["input"]["language_boost"] = language_boost

    last_error_message_for_key = ""

    # One attempt per configured key, starting wherever the shared
    # round-robin cursor currently points.
    for _ in range(len(REPLICATE_API_KEYS)):
        api_key = get_next_api_key()
        if not api_key:
            # A blank entry in the pool: skip it instead of aborting the
            # whole request, since other keys may still work.
            last_error_message_for_key = "Skipped an empty API key entry."
            print(last_error_message_for_key)
            continue

        print(f"Attempting API call with key ending: ...{api_key[-4:]}. Voice ID: {actual_voice_id_to_use}")

        try:
            audio_url, attempt_error = _run_prediction(api_key, payload)
        except requests.exceptions.HTTPError as e:
            # e.response can be None, so read both fields defensively.
            status_code = getattr(e.response, "status_code", "unknown")
            error_text = getattr(e.response, "text", "Unknown HTTP Error")
            last_error_message_for_key = f"HTTP error for key ...{api_key[-4:]}: {status_code} - {error_text}"
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            last_error_message_for_key = f"Request exception for key ...{api_key[-4:]}: {e}"
        else:
            if audio_url:
                gr.Info("Success! Audio generated.")
                return audio_url
            last_error_message_for_key = attempt_error

        print(last_error_message_for_key)  # Then fall through to the next key

    # If all keys failed
    final_error_message = "All API keys failed or an unrecoverable error occurred."
    if last_error_message_for_key:  # Add context from the last attempt if available
        final_error_message += f" Last attempt error: {last_error_message_for_key}"
    raise gr.Error(final_error_message)


# --- Gradio UI ---
# Two-column layout: the left (wider) column holds the text box and all
# synthesis parameters, the right column holds the trigger button and player.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# Glue Up Academy Narrator")
    gr.Markdown("Enter text and adjust parameters to generate speech.")

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                lines=5,
                placeholder="Enter your text here...\n💡Insert '<#0.5#>' to add a 0.5s pause. Adjust duration."
            )
            with gr.Accordion("Voice Selection", open=True):
                # Choices are the display names from VOICE_ID_MAP; generate_speech
                # translates the selected name back to a voice id.
                voice_id_dropdown = gr.Dropdown(
                    label="Choose a Voice ID",
                    choices=VOICE_ID_PRETTY_NAMES,
                    value=DEFAULT_VOICE_PRETTY_NAME
                )
                # A non-blank value here takes precedence over the dropdown.
                custom_voice_id_input = gr.Textbox(
                    label="Custom Voice ID (Optional)",
                    placeholder="e.g., my_cloned_voice_v2",
                    info="If filled, this will override dropdown."
                )
                # Disabled link to the public voice list, kept for reference:
                #gr.Markdown("[Minimax Voices](https://www.minimax.io/audio/voices) for more options.")
                gr.Markdown("For voice cloning, reach out to Raffy")
            with gr.Accordion("Advanced Speech Parameters", open=False):
                speed_slider = gr.Slider(label="Speed", minimum=0.5, maximum=2, step=0.1, value=1.0)
                volume_slider = gr.Slider(label="Volume", minimum=0, maximum=10, step=1, value=1)
                pitch_slider = gr.Slider(label="Pitch", minimum=-12, maximum=12, step=1, value=0)
                english_norm_checkbox = gr.Checkbox(label="English Normalization", value=False, info="Improves number reading.")

            with gr.Accordion("Audio Format & Emotion", open=False):
                emotion_dropdown = gr.Dropdown(label="Emotion", choices=EMOTIONS, value="auto")
                sample_rate_dropdown = gr.Dropdown(label="Sample Rate (Hz)", choices=SAMPLE_RATES, value=32000, type="value")
                bitrate_dropdown = gr.Dropdown(label="Bitrate (bps)", choices=BITRATES, value=128000, type="value")
                channel_dropdown = gr.Dropdown(label="Channels", choices=CHANNELS, value="mono")
                # "None" is a UI-only choice; generate_speech omits language_boost
                # from the request when it is selected.
                language_boost_dropdown = gr.Dropdown(label="Language Boost", choices=LANGUAGE_BOOST_OPTIONS, value="None")

        with gr.Column(scale=1):
            generate_button = gr.Button("Generate Speech", variant="primary")
            # There is no status textbox: generate_speech reports to the user
            # through its gr.Warning / gr.Info / gr.Error calls.
            # Receives whatever generate_speech returns (the prediction's
            # output URL, or None).
            audio_output = gr.Audio(label="Generated Speech", type="filepath")

    # Inputs are passed positionally, so this list must stay in the same order
    # as the parameters of generate_speech.
    generate_button.click(
        fn=generate_speech,
        inputs=[
            text_input, pitch_slider, speed_slider, volume_slider,
            bitrate_dropdown, channel_dropdown, emotion_dropdown,
            voice_id_dropdown, custom_voice_id_input, sample_rate_dropdown,
            language_boost_dropdown, english_norm_checkbox
        ],
        outputs=[audio_output] # The audio player is the only output component
    )

if __name__ == "__main__":
    # Report missing configuration on the console, then start the UI anyway.
    startup_notices = (
        (REPLICATE_API_KEYS, "FATAL: REPLICATE_API_TOKENS are not set."),
        (VOICE_ID_MAP, "WARNING: Voice ID map is empty (voices.json issue?)."),
    )
    for configured_value, notice in startup_notices:
        if not configured_value:
            print(notice)
    app.launch(debug=True)