Aid3445 committed on
Commit
ad8a59c
·
verified ·
1 Parent(s): 674ae41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -103
app.py CHANGED
@@ -9,6 +9,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import gc
10
  from huggingface_hub import hf_hub_download
11
  import json
 
12
 
13
  # Fix for OpenMP duplicate library error
14
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
@@ -16,8 +17,87 @@ os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
16
  # Force CPU usage for ONNX Runtime to avoid GPU issues
17
  os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
18
 
19
- # Import KittenTTS
20
- from kittentts import KittenTTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  class KittenTTSGradio:
23
  def __init__(self):
@@ -29,7 +109,6 @@ class KittenTTSGradio:
29
  ]
30
  self.max_workers = max(1, os.cpu_count() - 1) if os.cpu_count() else 2
31
  self.model_loaded = False
32
- # Don't load model in __init__, do it on first use
33
 
34
  def ensure_model_loaded(self):
35
  """Ensure model is loaded before use"""
@@ -37,47 +116,46 @@ class KittenTTSGradio:
37
  self.load_model()
38
 
39
  def download_and_load_model(self, repo_id):
40
- """Download model files and load them"""
41
  try:
42
  print(f"Downloading model files from {repo_id}...")
43
 
44
  # Download config file
45
- config_path = hf_hub_download(
46
- repo_id=repo_id,
47
- filename="config.json"
48
- )
49
 
50
  # Read config to get file names
51
  with open(config_path, 'r') as f:
52
  config = json.load(f)
53
 
54
- # Download model file - try different possible names
55
- model_filename = config.get("model_file", None)
56
  if not model_filename:
57
- # Try common names
58
- possible_names = ["kitten_tts_mini_v0_1.onnx", "kitten_tts_nano_v0_2.onnx", "kitten_tts_nano_v0_1.onnx"]
59
- for name in possible_names:
60
- try:
61
- model_path = hf_hub_download(repo_id=repo_id, filename=name)
62
- model_filename = name
63
- break
64
- except:
65
- continue
66
- else:
67
- model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
68
 
69
  # Download voices file
70
  voices_filename = config.get("voices", "voices.npz")
71
- voices_path = hf_hub_download(
72
- repo_id=repo_id,
73
- filename=voices_filename
74
- )
75
 
76
- print(f"Model files downloaded successfully")
 
 
 
 
 
77
 
78
- # Now try to load with KittenTTS using the repo_id
79
- # The library should use the cached files
80
- self.model = KittenTTS(repo_id)
81
  return True
82
 
83
  except Exception as e:
@@ -92,7 +170,23 @@ class KittenTTSGradio:
92
  try:
93
  print("Loading KittenTTS model...")
94
 
95
- # Try different loading strategies
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  strategies = [
97
  ("KittenML/kitten-tts-mini-0.1", "mini"),
98
  ("KittenML/kitten-tts-nano-0.2", "nano v0.2"),
@@ -100,25 +194,14 @@ class KittenTTSGradio:
100
  ]
101
 
102
  for repo_id, name in strategies:
103
- print(f"Trying to load {name} model...")
104
-
105
- # First try direct loading (in case files are cached)
106
- try:
107
- self.model = KittenTTS(repo_id)
108
- self.model_loaded = True
109
- print(f"Successfully loaded {name} model!")
110
- return
111
- except Exception as e:
112
- print(f"Direct loading failed: {e}")
113
-
114
- # Try downloading and loading
115
  if self.download_and_load_model(repo_id):
116
  self.model_loaded = True
117
- print(f"Successfully loaded {name} model after download!")
118
  return
119
 
120
  # If all strategies failed
121
- raise Exception("Failed to load any KittenTTS model. Please check your internet connection.")
122
 
123
  except Exception as e:
124
  print(f"Error loading model: {e}")
@@ -127,19 +210,15 @@ class KittenTTSGradio:
127
 
128
  def split_into_sentences(self, text):
129
  """Split text into sentences"""
130
- # Clean the text
131
  text = re.sub(r'\s+', ' ', text)
132
  text = text.strip()
133
 
134
- # Split by common sentence terminators
135
  sentences = re.split(r'(?<=[.!?])\s+', text)
136
 
137
- # Process each sentence
138
  processed_sentences = []
139
  for sentence in sentences:
140
  sentence = sentence.strip()
141
  if sentence:
142
- # Ensure proper punctuation
143
  if not sentence.endswith(('.', '!', '?')):
144
  sentence += '.'
145
  processed_sentences.append(sentence)
@@ -153,7 +232,6 @@ class KittenTTSGradio:
153
 
154
  chunks = []
155
  for i in range(0, len(sentences), chunk_size):
156
- # Join sentences in this chunk with a space
157
  chunk = ' '.join(sentences[i:i + chunk_size])
158
  chunks.append(chunk)
159
 
@@ -164,14 +242,10 @@ class KittenTTSGradio:
164
  if not text:
165
  return "Hello."
166
 
167
- # Remove problematic characters
168
  text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\'\"]', '', text)
169
-
170
- # Normalize whitespace
171
  text = re.sub(r'\s+', ' ', text)
172
  text = text.strip()
173
 
174
- # Ensure minimum length
175
  if len(text) < 5:
176
  text = "Hello."
177
 
@@ -179,7 +253,6 @@ class KittenTTSGradio:
179
 
180
  def safe_generate_audio(self, text, voice, speed):
181
  """Generate audio with fallback strategies"""
182
- # Ensure model is loaded
183
  self.ensure_model_loaded()
184
 
185
  if not self.model:
@@ -220,7 +293,6 @@ class KittenTTSGradio:
220
 
221
  def convert_text_to_speech(self, text, voice, speed, chunk_size, use_multithreading, progress=gr.Progress()):
222
  """Main conversion function for Gradio"""
223
- # Ensure model is loaded
224
  try:
225
  self.ensure_model_loaded()
226
  except Exception as e:
@@ -230,13 +302,11 @@ class KittenTTSGradio:
230
  raise gr.Error("Please enter some text to convert.")
231
 
232
  try:
233
- # Split into sentences first
234
  sentences = self.split_into_sentences(text)
235
 
236
  if not sentences:
237
  raise gr.Error("No valid sentences found in the text.")
238
 
239
- # Group sentences into chunks based on chunk_size
240
  chunks = self.group_sentences_into_chunks(sentences, chunk_size)
241
 
242
  total_chunks = len(chunks)
@@ -245,19 +315,15 @@ class KittenTTSGradio:
245
  chunk_label = "chunk" if chunk_size == 1 else f"chunk ({chunk_size} sentences each)"
246
  progress(0, desc=f"Processing {total_sentences} sentences in {total_chunks} {chunk_label}s...")
247
 
248
- # Process chunks
249
  audio_chunks = []
250
 
251
  if use_multithreading and total_chunks > 1:
252
- # Multithreaded processing
253
  with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
254
- # Submit all chunks
255
  futures = {
256
  executor.submit(self.process_single_sentence, chunk, voice, speed): i
257
  for i, chunk in enumerate(chunks)
258
  }
259
 
260
- # Collect results in order
261
  results = {}
262
  completed = 0
263
 
@@ -273,11 +339,9 @@ class KittenTTSGradio:
273
  print(f"Error processing chunk: {e}")
274
  continue
275
 
276
- # Sort by index
277
  for i in sorted(results.keys()):
278
  audio_chunks.append(results[i])
279
  else:
280
- # Sequential processing
281
  for i, chunk in enumerate(chunks):
282
  try:
283
  audio = self.process_single_sentence(chunk, voice, speed)
@@ -291,7 +355,6 @@ class KittenTTSGradio:
291
  if not audio_chunks:
292
  raise gr.Error("Failed to generate any audio.")
293
 
294
- # Concatenate audio chunks
295
  progress(0.9, desc="Concatenating audio...")
296
 
297
  if len(audio_chunks) == 1:
@@ -299,14 +362,12 @@ class KittenTTSGradio:
299
  else:
300
  final_audio = np.concatenate(audio_chunks)
301
 
302
- # Create temporary file for output
303
  output_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
304
  sf.write(output_file.name, final_audio, 24000)
305
  output_file.close()
306
 
307
  progress(1.0, desc="Complete!")
308
 
309
- # Clean up memory
310
  gc.collect()
311
 
312
  processing_method = "multithreading" if use_multithreading else "sequential"
@@ -318,7 +379,7 @@ class KittenTTSGradio:
318
  except Exception as e:
319
  raise gr.Error(f"Conversion failed: {str(e)}")
320
 
321
- # Initialize the app - don't load model yet
322
  print("Initializing KittenTTS app...")
323
  app = KittenTTSGradio()
324
  print("App initialized, model will load on first use")
@@ -331,14 +392,8 @@ def create_interface():
331
 
332
  Convert text to natural-sounding speech using KittenTTS - a lightweight TTS model that runs on CPU.
333
 
334
- **Features:**
335
- - 8 different voice options (male and female)
336
- - Adjustable speech speed
337
- - Adjustable chunk size for processing
338
- - Sentence-by-sentence or multi-sentence processing
339
- - Multithreading support for faster processing
340
-
341
- **Note:** The model will download on first use (~170MB for mini model, ~25MB for nano).
342
  """)
343
 
344
  with gr.Row():
@@ -348,7 +403,7 @@ def create_interface():
348
  placeholder="Enter your text here or upload a file...",
349
  lines=10,
350
  max_lines=20,
351
- value="" # Start with empty text
352
  )
353
 
354
  with gr.Row():
@@ -360,13 +415,11 @@ def create_interface():
360
 
361
  clear_btn = gr.Button("Clear Text", size="sm")
362
 
363
- # File upload handler
364
  def load_file(file_path):
365
  if file_path:
366
  try:
367
  with open(file_path, 'r', encoding='utf-8') as f:
368
  content = f.read()
369
- # Limit display for very large files
370
  if len(content) > 50000:
371
  display_text = content[:50000] + "\n\n... (truncated for display)"
372
  else:
@@ -379,17 +432,8 @@ def create_interface():
379
  def clear_text():
380
  return ""
381
 
382
- file_upload.change(
383
- fn=load_file,
384
- inputs=[file_upload],
385
- outputs=[text_input]
386
- )
387
-
388
- clear_btn.click(
389
- fn=clear_text,
390
- inputs=[],
391
- outputs=[text_input]
392
- )
393
 
394
  with gr.Column(scale=1):
395
  voice_dropdown = gr.Dropdown(
@@ -441,7 +485,6 @@ def create_interface():
441
  value="Ready to convert text to speech."
442
  )
443
 
444
- # Examples
445
  gr.Examples(
446
  examples=[
447
  ["Hello! This is a test of the KittenTTS system. It can convert text to natural sounding speech."],
@@ -452,7 +495,6 @@ def create_interface():
452
  label="Example Texts"
453
  )
454
 
455
- # Connect the conversion function
456
  convert_btn.click(
457
  fn=app.convert_text_to_speech,
458
  inputs=[text_input, voice_dropdown, speed_slider, chunk_size_slider, multithread_checkbox],
@@ -461,24 +503,16 @@ def create_interface():
461
 
462
  gr.Markdown("""
463
  ---
464
- ### 📝 Tips:
465
- - **Chunk Size**: Set to 1 for maximum quality (processes each sentence separately). Increase for faster processing of long texts.
466
- - **Trade-offs**: Larger chunks = faster processing but may have less natural pauses between sentences
467
- - Processing time depends on text length, chunk size, and multithreading setting
468
- - Each voice has different characteristics - try them out!
469
- - The model runs entirely on CPU - no GPU required
470
- - First conversion will take longer as the model downloads and loads
471
 
472
  ### 🎭 Available Voices:
473
  - **expr-voice-2-m/f**: Expressive male/female voices
474
  - **expr-voice-3-m/f**: Natural male/female voices
475
  - **expr-voice-4-m/f**: Clear male/female voices
476
  - **expr-voice-5-m/f**: Warm male/female voices
477
-
478
- ### ⚙️ Chunk Size Guide:
479
- - **1 sentence**: Best quality, natural pauses (recommended for short texts)
480
- - **2-3 sentences**: Good balance of speed and quality
481
- - **5+ sentences**: Faster processing for long texts (may sound more continuous)
482
  """)
483
 
484
  return demo
 
9
  import gc
10
  from huggingface_hub import hf_hub_download
11
  import json
12
+ import onnxruntime as ort
13
 
14
  # Fix for OpenMP duplicate library error
15
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
 
17
  # Force CPU usage for ONNX Runtime to avoid GPU issues
18
  os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
19
 
20
class DirectKittenTTS:
    """Run a KittenTTS ONNX model directly through ONNX Runtime.

    This is a fallback used when the ``kittentts`` package cannot be used.
    The text front-end here is a placeholder: the real KittenTTS pipeline
    phonemizes with espeak-ng and uses a proper tokenizer, whereas this class
    feeds raw character code points to the model.
    """

    def __init__(self, model_path, voices_path):
        """Open the ONNX session and load the voice embeddings.

        Args:
            model_path: Path to the ``.onnx`` model file.
            voices_path: Path to the voices archive, loaded with ``np.load``.
        """
        self.session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])
        self.voices_data = np.load(voices_path)
        self.voice_list = list(self.voices_data.keys())
        print(f"Loaded model with voices: {self.voice_list}")

    def text_to_phonemes(self, text):
        """Return a lower-cased copy of ``text`` with unsupported characters removed.

        Despite the name, no phonemization happens here; only word characters,
        whitespace and basic punctuation are kept.
        """
        text = text.lower()
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\'\"]', '', text)
        return text

    def generate(self, text, voice='expr-voice-2-m', speed=1.0):
        """Synthesize audio for ``text``.

        Args:
            text: Text to speak.
            voice: Key into the voices archive; when unknown, the first
                available voice is used instead.
            speed: Positive speed factor. It is passed to the model when the
                model exposes a speed input; otherwise the waveform is
                resampled by linear interpolation.

        Returns:
            A 1-D NumPy array of samples. On any failure a one-second 440 Hz
            tone (24000 samples) is returned instead of raising, so callers
            always receive audio.
        """
        try:
            if speed <= 0:
                raise ValueError(f"speed must be positive, got {speed}")

            if voice not in self.voices_data:
                print(f"Voice {voice} not found, using first available voice")
                voice = self.voice_list[0]

            voice_embedding = self.voices_data[voice]

            # Placeholder tokenization: code points, zero-padded to a fixed length.
            phonemes = self.text_to_phonemes(text)
            max_length = 512
            text_encoded = [ord(c) for c in phonemes[:max_length]]
            text_encoded += [0] * (max_length - len(text_encoded))
            text_input = np.array([text_encoded], dtype=np.int64)

            # Bind each model input by name. Remember whether the model takes
            # the speed itself so it is not applied a second time below.
            inputs = {}
            speed_handled_by_model = False
            for model_input in self.session.get_inputs():
                name = model_input.name
                lowered = name.lower()
                if 'text' in lowered or 'input' in lowered:
                    inputs[name] = text_input
                elif 'voice' in lowered or 'speaker' in lowered or 'style' in lowered:
                    # NOTE(review): 'style' is assumed to be the voice-embedding
                    # input of the upstream KittenTTS model - confirm.
                    inputs[name] = voice_embedding.reshape(1, -1)
                elif 'speed' in lowered:
                    inputs[name] = np.array([[speed]], dtype=np.float32)
                    speed_handled_by_model = True

            outputs = self.session.run(None, inputs)

            # The first output is taken as the waveform; flatten it so that
            # even a single-sample result is 1-D.
            audio = np.asarray(outputs[0]).reshape(-1)

            # Resample only when the model did not already honor the speed.
            if speed != 1.0 and not speed_handled_by_model and audio.size > 1:
                original_length = audio.size
                new_length = max(1, int(original_length / speed))
                indices = np.linspace(0, original_length - 1, new_length)
                audio = np.interp(indices, np.arange(original_length), audio)

            return audio

        except Exception as e:
            # Deliberate best-effort: report the error and return an audible tone.
            print(f"Error in generate: {e}")
            duration = 1.0
            sample_rate = 24000
            t = np.linspace(0, duration, int(sample_rate * duration))
            audio = np.sin(2 * np.pi * 440 * t) * 0.3
            return audio
101
 
102
  class KittenTTSGradio:
103
  def __init__(self):
 
109
  ]
110
  self.max_workers = max(1, os.cpu_count() - 1) if os.cpu_count() else 2
111
  self.model_loaded = False
 
112
 
113
  def ensure_model_loaded(self):
114
  """Ensure model is loaded before use"""
 
116
  self.load_model()
117
 
118
  def download_and_load_model(self, repo_id):
119
+ """Download model files and load them directly"""
120
  try:
121
  print(f"Downloading model files from {repo_id}...")
122
 
123
  # Download config file
124
+ config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
 
 
 
125
 
126
  # Read config to get file names
127
  with open(config_path, 'r') as f:
128
  config = json.load(f)
129
 
130
+ # Get model filename from config or use defaults
131
+ model_filename = config.get("model_file")
132
  if not model_filename:
133
+ # Try to guess based on repo name
134
+ if "mini" in repo_id:
135
+ model_filename = "kitten_tts_mini_v0_1.onnx"
136
+ elif "nano" in repo_id and "0.2" in repo_id:
137
+ model_filename = "kitten_tts_nano_v0_2.onnx"
138
+ else:
139
+ model_filename = "kitten_tts_nano_v0_1.onnx"
140
+
141
+ # Download model file
142
+ print(f"Downloading model file: {model_filename}")
143
+ model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
144
 
145
  # Download voices file
146
  voices_filename = config.get("voices", "voices.npz")
147
+ print(f"Downloading voices file: {voices_filename}")
148
+ voices_path = hf_hub_download(repo_id=repo_id, filename=voices_filename)
149
+
150
+ print(f"Files downloaded: {model_path}, {voices_path}")
151
 
152
+ # Create our direct ONNX model
153
+ self.model = DirectKittenTTS(model_path, voices_path)
154
+
155
+ # Update available voices based on what's actually in the file
156
+ if hasattr(self.model, 'voice_list'):
157
+ self.available_voices = self.model.voice_list
158
 
 
 
 
159
  return True
160
 
161
  except Exception as e:
 
170
  try:
171
  print("Loading KittenTTS model...")
172
 
173
+ # First, try to import and use KittenTTS if available
174
+ try:
175
+ from kittentts import KittenTTS
176
+ # Try loading with the library first
177
+ for repo_id in ["KittenML/kitten-tts-mini-0.1", "KittenML/kitten-tts-nano-0.2"]:
178
+ try:
179
+ print(f"Trying to load {repo_id} with KittenTTS library...")
180
+ self.model = KittenTTS(repo_id)
181
+ self.model_loaded = True
182
+ print(f"Successfully loaded {repo_id} with KittenTTS!")
183
+ return
184
+ except:
185
+ continue
186
+ except ImportError:
187
+ print("KittenTTS library not available, using direct ONNX loading")
188
+
189
+ # If library loading failed, use our direct implementation
190
  strategies = [
191
  ("KittenML/kitten-tts-mini-0.1", "mini"),
192
  ("KittenML/kitten-tts-nano-0.2", "nano v0.2"),
 
194
  ]
195
 
196
  for repo_id, name in strategies:
197
+ print(f"Trying to load {name} model directly...")
 
 
 
 
 
 
 
 
 
 
 
198
  if self.download_and_load_model(repo_id):
199
  self.model_loaded = True
200
+ print(f"Successfully loaded {name} model!")
201
  return
202
 
203
  # If all strategies failed
204
+ raise Exception("Failed to load any KittenTTS model")
205
 
206
  except Exception as e:
207
  print(f"Error loading model: {e}")
 
210
 
211
  def split_into_sentences(self, text):
212
  """Split text into sentences"""
 
213
  text = re.sub(r'\s+', ' ', text)
214
  text = text.strip()
215
 
 
216
  sentences = re.split(r'(?<=[.!?])\s+', text)
217
 
 
218
  processed_sentences = []
219
  for sentence in sentences:
220
  sentence = sentence.strip()
221
  if sentence:
 
222
  if not sentence.endswith(('.', '!', '?')):
223
  sentence += '.'
224
  processed_sentences.append(sentence)
 
232
 
233
  chunks = []
234
  for i in range(0, len(sentences), chunk_size):
 
235
  chunk = ' '.join(sentences[i:i + chunk_size])
236
  chunks.append(chunk)
237
 
 
242
  if not text:
243
  return "Hello."
244
 
 
245
  text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\'\"]', '', text)
 
 
246
  text = re.sub(r'\s+', ' ', text)
247
  text = text.strip()
248
 
 
249
  if len(text) < 5:
250
  text = "Hello."
251
 
 
253
 
254
  def safe_generate_audio(self, text, voice, speed):
255
  """Generate audio with fallback strategies"""
 
256
  self.ensure_model_loaded()
257
 
258
  if not self.model:
 
293
 
294
  def convert_text_to_speech(self, text, voice, speed, chunk_size, use_multithreading, progress=gr.Progress()):
295
  """Main conversion function for Gradio"""
 
296
  try:
297
  self.ensure_model_loaded()
298
  except Exception as e:
 
302
  raise gr.Error("Please enter some text to convert.")
303
 
304
  try:
 
305
  sentences = self.split_into_sentences(text)
306
 
307
  if not sentences:
308
  raise gr.Error("No valid sentences found in the text.")
309
 
 
310
  chunks = self.group_sentences_into_chunks(sentences, chunk_size)
311
 
312
  total_chunks = len(chunks)
 
315
  chunk_label = "chunk" if chunk_size == 1 else f"chunk ({chunk_size} sentences each)"
316
  progress(0, desc=f"Processing {total_sentences} sentences in {total_chunks} {chunk_label}s...")
317
 
 
318
  audio_chunks = []
319
 
320
  if use_multithreading and total_chunks > 1:
 
321
  with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
 
322
  futures = {
323
  executor.submit(self.process_single_sentence, chunk, voice, speed): i
324
  for i, chunk in enumerate(chunks)
325
  }
326
 
 
327
  results = {}
328
  completed = 0
329
 
 
339
  print(f"Error processing chunk: {e}")
340
  continue
341
 
 
342
  for i in sorted(results.keys()):
343
  audio_chunks.append(results[i])
344
  else:
 
345
  for i, chunk in enumerate(chunks):
346
  try:
347
  audio = self.process_single_sentence(chunk, voice, speed)
 
355
  if not audio_chunks:
356
  raise gr.Error("Failed to generate any audio.")
357
 
 
358
  progress(0.9, desc="Concatenating audio...")
359
 
360
  if len(audio_chunks) == 1:
 
362
  else:
363
  final_audio = np.concatenate(audio_chunks)
364
 
 
365
  output_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
366
  sf.write(output_file.name, final_audio, 24000)
367
  output_file.close()
368
 
369
  progress(1.0, desc="Complete!")
370
 
 
371
  gc.collect()
372
 
373
  processing_method = "multithreading" if use_multithreading else "sequential"
 
379
  except Exception as e:
380
  raise gr.Error(f"Conversion failed: {str(e)}")
381
 
382
+ # Initialize the app
383
  print("Initializing KittenTTS app...")
384
  app = KittenTTSGradio()
385
  print("App initialized, model will load on first use")
 
392
 
393
  Convert text to natural-sounding speech using KittenTTS - a lightweight TTS model that runs on CPU.
394
 
395
+ **Note:** First conversion will download and load the model (~170MB for mini, ~25MB for nano).
396
+ If you encounter issues, please try refreshing the page.
 
 
 
 
 
 
397
  """)
398
 
399
  with gr.Row():
 
403
  placeholder="Enter your text here or upload a file...",
404
  lines=10,
405
  max_lines=20,
406
+ value=""
407
  )
408
 
409
  with gr.Row():
 
415
 
416
  clear_btn = gr.Button("Clear Text", size="sm")
417
 
 
418
  def load_file(file_path):
419
  if file_path:
420
  try:
421
  with open(file_path, 'r', encoding='utf-8') as f:
422
  content = f.read()
 
423
  if len(content) > 50000:
424
  display_text = content[:50000] + "\n\n... (truncated for display)"
425
  else:
 
432
  def clear_text():
433
  return ""
434
 
435
+ file_upload.change(fn=load_file, inputs=[file_upload], outputs=[text_input])
436
+ clear_btn.click(fn=clear_text, inputs=[], outputs=[text_input])
 
 
 
 
 
 
 
 
 
437
 
438
  with gr.Column(scale=1):
439
  voice_dropdown = gr.Dropdown(
 
485
  value="Ready to convert text to speech."
486
  )
487
 
 
488
  gr.Examples(
489
  examples=[
490
  ["Hello! This is a test of the KittenTTS system. It can convert text to natural sounding speech."],
 
495
  label="Example Texts"
496
  )
497
 
 
498
  convert_btn.click(
499
  fn=app.convert_text_to_speech,
500
  inputs=[text_input, voice_dropdown, speed_slider, chunk_size_slider, multithread_checkbox],
 
503
 
504
  gr.Markdown("""
505
  ---
506
+ ### ⚙️ Chunk Size Guide:
507
+ - **1 sentence**: Best quality, natural pauses (recommended for short texts)
508
+ - **2-3 sentences**: Good balance of speed and quality
509
+ - **5+ sentences**: Faster processing for long texts (may sound more continuous)
 
 
 
510
 
511
  ### 🎭 Available Voices:
512
  - **expr-voice-2-m/f**: Expressive male/female voices
513
  - **expr-voice-3-m/f**: Natural male/female voices
514
  - **expr-voice-4-m/f**: Clear male/female voices
515
  - **expr-voice-5-m/f**: Warm male/female voices
 
 
 
 
 
516
  """)
517
 
518
  return demo