Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import soundfile as sf | |
| from kokoro import KPipeline | |
| import re | |
| import traceback | |
# Helper: Format seconds into an SRT timestamp (hh:mm:ss,mmm)
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds *before* splitting into
    fields, so inputs such as 59.9999 become ``00:01:00,000`` instead of
    the invalid ``00:00:60,000`` the old float formatting produced.

    Args:
        seconds: Non-negative duration in seconds (float or int).

    Returns:
        The timestamp string with a comma millisecond separator, as
        required by the SubRip (SRT) format.
    """
    total_ms = int(round(seconds * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
# Kokoro's fixed output sample rate in Hz (used for both duration math and
# the audio tuple returned to Gradio; previously hard-coded in two places).
SAMPLE_RATE = 24000

# Pipelines are expensive to build (model load), so keep one per language
# code instead of re-initializing on every request.
_pipeline_cache = {}


def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize ``text`` with Kokoro TTS and build matching SRT subtitles.

    Args:
        text: Input text to synthesize.
        voice: Kokoro voice identifier (e.g. ``"af_heart"``).
        speed: Playback speed multiplier.
        lang_code: Kokoro language code (``'a'`` = American English, ...).
        split_pattern: Regex the pipeline uses to split ``text`` into segments.
        debug: When True, per-segment grapheme/phoneme details are appended
            to the returned debug log.

    Returns:
        A 3-tuple ``(audio, srt, log)`` where ``audio`` is
        ``(SAMPLE_RATE, numpy_array)`` on success or ``None`` on failure,
        ``srt`` is the subtitle text (empty on failure), and ``log`` is the
        newline-joined debug output.
    """
    debug_logs = ["Starting Kokoro TTS generation..."]

    try:
        debug_logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        # Reuse a cached pipeline when one exists for this language code;
        # KPipeline falls back to CPU when no GPU is available.
        pipeline = _pipeline_cache.get(lang_code)
        if pipeline is None:
            pipeline = KPipeline(lang_code=lang_code)
            _pipeline_cache[lang_code] = pipeline
        debug_logs.append("Pipeline initialized successfully.")
    except Exception as e:
        debug_logs.append(f"Error initializing pipeline: {str(e)}")
        return None, "", "\n".join(debug_logs)

    # Accumulators for audio, subtitles, and per-segment diagnostics.
    audio_segments = []
    srt_entries = []
    current_time = 0.0  # cumulative playback position for SRT timestamps
    segment_index = 1
    segment_debug_info = []

    try:
        debug_logs.append("Generating audio segments from input text...")
        # The pipeline yields (graphemes, phonemes, audio) per text segment,
        # where segmentation is controlled by the split_pattern regex.
        generator = pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )
        for gs, ps, audio in generator:
            duration = len(audio) / float(SAMPLE_RATE)
            start_timestamp = current_time
            end_timestamp = current_time + duration
            # One SRT cue per segment; the trailing newline plus the join
            # below yields the blank line SRT requires between cues.
            srt_entries.append(
                f"{segment_index}\n{format_time(start_timestamp)} --> {format_time(end_timestamp)}\n{gs}\n"
            )
            current_time = end_timestamp
            segment_debug_info.append(
                f"Segment {segment_index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}"
            )
            audio_segments.append(audio)
            segment_index += 1
        debug_logs.append("Audio segments generated successfully.")
    except Exception as e:
        debug_logs.append(f"Error during audio generation: {str(e)}\n{traceback.format_exc()}")
        return None, "", "\n".join(debug_logs)

    if not audio_segments:
        debug_logs.append("No audio segments were generated.")
        return None, "", "\n".join(debug_logs)

    # Stitch the per-segment waveforms into one continuous array.
    full_audio = np.concatenate(audio_segments)
    srt_content = "\n".join(srt_entries)

    # Segment-level details are only surfaced when debug mode is on.
    if debug:
        debug_info = "\n".join(debug_logs + segment_debug_info)
    else:
        debug_info = "\n".join(debug_logs)

    return (SAMPLE_RATE, full_audio), srt_content, debug_info
# Build the Gradio interface.
# Input widgets map positionally onto generate_audio's parameters
# (text, voice, speed, lang_code, split_pattern, debug); outputs map onto
# its returned 3-tuple (audio, SRT text, debug log).
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
        # Voice name is free text; must be a voice the Kokoro model knows.
        gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
        gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
        gr.Textbox(label="Language Code", value="a",
                   placeholder="Enter language code ('a' for American English, 'b' for British, etc.)"),
        # Regex forwarded verbatim to the pipeline's split_pattern argument.
        gr.Textbox(label="Split Pattern (Regex)", value=r'\n+',
                   placeholder="Regex to split the input text (e.g., '\\n+')"),
        gr.Checkbox(label="Enable Debug Mode", value=True)
    ],
    outputs=[
        # type="numpy" matches the (sample_rate, ndarray) tuple returned
        # by generate_audio.
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Textbox(label="Generated SRT"),
        gr.Textbox(label="Debug Information", lines=15)
    ],
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=("This app uses the Kokoro TTS model to generate audio from text. "
                 "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
                 "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed.")
)
# Start the Gradio web server only when the script is executed directly
# (not when imported as a module).
if __name__ == "__main__":
    iface.launch()