Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
from kokoro import KPipeline
|
| 6 |
+
import re
|
| 7 |
+
import traceback
|
| 8 |
+
|
| 9 |
+
# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    """Convert a duration in seconds to an SRT timestamp ``HH:MM:SS,mmm``.

    Works in integer milliseconds so that values like 59.9996 round to
    "00:01:00,000" instead of producing an invalid "00:00:60,000"
    (formatting float seconds directly lets the rounding overflow the
    seconds field).
    """
    total_ms = round(seconds * 1000)
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1000)
    # SRT requires a comma between seconds and milliseconds.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
| 16 |
+
|
| 17 |
+
def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize speech from *text* with Kokoro TTS and build a matching SRT.

    Parameters
    ----------
    text : str
        Input text to synthesize.
    voice : str
        Kokoro voice identifier (e.g. "af_heart").
    speed : float
        Playback speed multiplier.
    lang_code : str
        Kokoro language code ('a' for American English, 'b' for British, ...).
    split_pattern : str
        Regex the pipeline uses to split the text into segments.
    debug : bool
        When True, per-segment grapheme/phoneme details are appended to the log.

    Returns
    -------
    tuple
        ``((sample_rate, audio_array), srt_text, debug_log)`` on success, or
        ``(None, "", debug_log)`` when initialization/generation fails or no
        segments are produced.
    """
    # Kokoro's output rate; single source of truth for durations and playback.
    # (Previously hard-coded as 24000 in two places.)
    sample_rate = 24000

    debug_logs = ["Starting Kokoro TTS generation..."]

    try:
        debug_logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        # Initialize the pipeline; by default, it will run on CPU if no GPU is available.
        pipeline = KPipeline(lang_code=lang_code)
        debug_logs.append("Pipeline initialized successfully.")
    except Exception as e:
        debug_logs.append(f"Error initializing pipeline: {str(e)}")
        return None, "", "\n".join(debug_logs)

    # Accumulators for audio segments, SRT entries, and segment-level debug info.
    audio_segments = []
    srt_entries = []
    current_time = 0.0  # cumulative time for SRT timestamps
    segment_index = 1
    segment_debug_info = []

    try:
        debug_logs.append("Generating audio segments from input text...")
        # The split_pattern parameter (regex) defines how the text is segmented.
        generator = pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )

        for gs, ps, audio in generator:
            # Coerce to a NumPy array up front — the pipeline may yield torch
            # tensors; this also makes len()/concatenate uniform.
            audio = np.asarray(audio)
            duration = len(audio) / float(sample_rate)
            start_timestamp = current_time
            end_timestamp = current_time + duration
            # Create an SRT entry for the segment.
            srt_entries.append(
                f"{segment_index}\n{format_time(start_timestamp)} --> {format_time(end_timestamp)}\n{gs}\n"
            )
            current_time = end_timestamp

            # Record segment details for debugging.
            segment_debug_info.append(
                f"Segment {segment_index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}"
            )
            audio_segments.append(audio)
            segment_index += 1

        debug_logs.append("Audio segments generated successfully.")
    except Exception as e:
        debug_logs.append(f"Error during audio generation: {str(e)}\n{traceback.format_exc()}")
        return None, "", "\n".join(debug_logs)

    if not audio_segments:
        debug_logs.append("No audio segments were generated.")
        return None, "", "\n".join(debug_logs)

    # Concatenate all the generated segments into a single audio array.
    full_audio = np.concatenate(audio_segments)

    # Combine all SRT entries into one string.
    srt_content = "\n".join(srt_entries)

    # Combine all debug logs (with optional segment details).
    if debug:
        debug_info = "\n".join(debug_logs + segment_debug_info)
    else:
        debug_info = "\n".join(debug_logs)

    # Return a tuple: audio (with sample rate), the SRT text, and the debug log.
    return (sample_rate, full_audio), srt_content, debug_info
|
| 87 |
+
|
| 88 |
+
# Build the Gradio interface.
# The input widgets mirror generate_audio's signature, in order:
# text, voice, speed, lang_code, split_pattern, debug.
_input_widgets = [
    gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
    gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
    gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
    gr.Textbox(
        label="Language Code",
        value="a",
        placeholder="Enter language code ('a' for American English, 'b' for British, etc.)",
    ),
    gr.Textbox(
        label="Split Pattern (Regex)",
        value=r'\n+',
        placeholder="Regex to split the input text (e.g., '\\n+')",
    ),
    gr.Checkbox(label="Enable Debug Mode", value=True),
]

# Outputs: synthesized audio, the generated subtitles, and the debug log.
_output_widgets = [
    gr.Audio(label="Generated Audio", type="numpy"),
    gr.Textbox(label="Generated SRT"),
    gr.Textbox(label="Debug Information", lines=15),
]

_description = (
    "This app uses the Kokoro TTS model to generate audio from text. "
    "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
    "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed."
)

iface = gr.Interface(
    fn=generate_audio,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=_description,
)

# Launch the UI only when this file is executed directly.
if __name__ == "__main__":
    iface.launch()
|