ruslanmv committed
Commit cd32542 · Parent: 04c9a5e
Files changed (2)
  1. app.py +176 -131
  2. requirements.txt +6 -3
app.py CHANGED
@@ -5,15 +5,28 @@ from __future__ import annotations
 import os
 import requests
 import base64
-import datetime
 import struct
 import re
 import textwrap
-import time
 import uuid
+from typing import List, Dict, Tuple, Generator
+
+# --- Load .env early (for HF_TOKEN / SECRET_TOKEN) ---
+from dotenv import load_dotenv
+load_dotenv()
 
 # --- Hugging Face Spaces & ZeroGPU ---
-import spaces # Required for ZeroGPU
+try:
+    import spaces  # Required for ZeroGPU on HF
+except Exception:
+    # Allow local runs without the spaces package
+    class _SpacesShim:
+        def GPU(self, *args, **kwargs):
+            def _wrap(fn):
+                return fn
+            return _wrap
+    spaces = _SpacesShim()
+
 import gradio as gr
 
 # --- Core ML & Data Libraries ---
@@ -33,30 +46,29 @@ import nltk
 import langid
 import emoji
 import noisereduce as nr
-import dotenv
 
 # ===================================================================================
 # 2. GLOBAL CONFIGURATION & HELPER FUNCTIONS
 # ===================================================================================
 
-# --- Download NLTK data once ---
+# Download NLTK data (punkt)
 nltk.download("punkt", quiet=True)
+
 os.environ["COQUI_TOS_AGREED"] = "1"
 
-# --- Define global variables for caching models ---
-# This prevents reloading the models on every single run, which would be very slow.
-tts_model = None
-llm_model = None
+# Cached models
+tts_model: Xtts | None = None
+llm_model: Llama | None = None
 
-# --- Configuration ---
+# Configuration
 HF_TOKEN = os.environ.get("HF_TOKEN")
 api = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 repo_id = "ruslanmv/ai-story-server"
-SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'secret')  # Default secret
+SECRET_TOKEN = os.getenv("SECRET_TOKEN", "secret")
 SENTENCE_SPLIT_LENGTH = 250
 LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]
 
-# --- System Prompts and Roles ---
+# System prompts and roles
 default_system_message = (
     "You're a storyteller crafting a short tale for young listeners. Keep sentences short and simple. "
     "Use narrative style only, without lists or complex words. Type numbers as words (e.g., 'ten')."
@@ -69,28 +81,36 @@ ROLE_PROMPTS["Pirate"] = (
     "Keep answers short, as if in a real conversation. Only provide the words AI Beard would speak."
 )
 
-# --- Audio and Text Helper Functions ---
-def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
+# --- Audio helpers ---
+def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, bit_depth: int = 16) -> bytes:
     if pcm_data.startswith(b"RIFF"):
         return pcm_data
     chunk_size = 36 + len(pcm_data)
-    return struct.pack('<4sI4s4sIHHIIHH4sI',
-                       b'RIFF', chunk_size, b'WAVE', b'fmt ',
-                       16, 1, channels, sample_rate,
-                       sample_rate * channels * bit_depth // 8,
-                       channels * bit_depth // 8, bit_depth,
-                       b'data', len(pcm_data)) + pcm_data
-
-def split_sentences(text, max_len):
+    header = struct.pack(
+        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF", chunk_size, b"WAVE", b"fmt ",
+        16, 1, channels, sample_rate,
+        sample_rate * channels * bit_depth // 8,
+        channels * bit_depth // 8, bit_depth,
+        b"data", len(pcm_data)
+    )
+    return header + pcm_data
+
+def split_sentences(text: str, max_len: int) -> List[str]:
     sentences = nltk.sent_tokenize(text)
-    return [sub_sent for sent in sentences for sub_sent in (
-        textwrap.wrap(sent, max_len, break_long_words=True) if len(sent) > max_len else [sent]
-    )]
+    chunks: List[str] = []
+    for sent in sentences:
+        if len(sent) > max_len:
+            chunks.extend(textwrap.wrap(sent, max_len, break_long_words=True))
+        else:
+            chunks.append(sent)
+    return chunks
 
-def format_prompt_zephyr(message, history, system_message):
+def format_prompt_zephyr(message: str, history: List[Tuple[str, str | None]], system_message: str) -> str:
     prompt = f"<|system|>\n{system_message}</s>"
     for user_prompt, bot_response in history:
-        prompt += f"<|user|>\n{user_prompt}</s><|assistant|>\n{bot_response}</s>"
+        if bot_response:
+            prompt += f"<|user|>\n{user_prompt}</s><|assistant|>\n{bot_response}</s>"
     prompt += f"<|user|>\n{message}</s><|assistant|>"
     return prompt
 
@@ -98,52 +118,64 @@ def format_prompt_zephyr(message, history, system_message):
 # 3. CORE AI FUNCTIONS (Model Loading & Inference)
 # ===================================================================================
 
-def load_models():
-    """Loads and caches the TTS and LLM models if they haven't been loaded yet."""
+def _load_xtts(device: str) -> Xtts:
+    print("Loading Coqui XTTS V2 model (first run)...")
+    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+    ModelManager().download_model(model_name)
+    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+
+    config = XttsConfig()
+    config.load_json(os.path.join(model_path, "config.json"))
+    model = Xtts.init_from_config(config)
+    # NOTE: deepspeed not installed; keep False for Spaces
+    model.load_checkpoint(
+        config,
+        checkpoint_path=os.path.join(model_path, "model.pth"),
+        vocab_path=os.path.join(model_path, "vocab.json"),
+        eval=True,
+        use_deepspeed=False,
+    )
+    model.to(device)
+    print("XTTS model loaded.")
+    return model
+
+def _load_llama() -> Llama:
+    print("Loading LLM (Zephyr) (first run)...")
+    zephyr_model_path = hf_hub_download(
+        repo_id="TheBloke/zephyr-7B-beta-GGUF",
+        filename="zephyr-7b-beta.Q5_K_M.gguf"
+    )
+    # Try GPU offload if available, else CPU
+    for n_gpu_layers in (-1, 0):
+        try:
+            llm = Llama(
+                model_path=zephyr_model_path,
+                n_gpu_layers=n_gpu_layers,
+                n_ctx=4096,
+                n_batch=512,
+                verbose=False
+            )
+            if n_gpu_layers == -1:
+                print("LLM loaded with GPU offload.")
+            else:
+                print("LLM loaded (CPU).")
+            return llm
+        except Exception as e:
+            print(f"LLM init with n_gpu_layers={n_gpu_layers} failed: {e}")
+    raise RuntimeError("Failed to initialize Llama model.")
+
+def load_models() -> Tuple[Xtts, Llama]:
     global tts_model, llm_model
-
     device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # --- Load Coqui TTS XTTS Model ---
     if tts_model is None:
-        print("Loading Coqui XTTS V2 model for the first time...")
-        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-        ModelManager().download_model(model_name)
-        model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
-
-        config = XttsConfig()
-        config.load_json(os.path.join(model_path, "config.json"))
-        tts_model = Xtts.init_from_config(config)
-        tts_model.load_checkpoint(
-            config,
-            checkpoint_path=os.path.join(model_path, "model.pth"),
-            vocab_path=os.path.join(model_path, "vocab.json"),
-            eval=True,
-            use_deepspeed=True,
-        )
-        tts_model.to(device)
-        print("XTTS model loaded and cached successfully.")
-
-    # --- Load Large Language Model (Zephyr) ---
+        tts_model = _load_xtts(device)
     if llm_model is None:
-        print("Loading LLM (Zephyr) for the first time...")
-        zephyr_model_path = hf_hub_download(
-            repo_id="TheBloke/zephyr-7B-beta-GGUF",
-            filename="zephyr-7b-beta.Q5_K_M.gguf"
-        )
-        llm_model = Llama(
-            model_path=zephyr_model_path,
-            n_gpu_layers=-1, # Offload all layers to GPU
-            n_ctx=4096,
-            n_batch=512,
-            verbose=False
-        )
-        print("LLM loaded and cached successfully.")
-
+        llm_model = _load_llama()
     return tts_model, llm_model
 
-def generate_text_stream(llm_instance, prompt, history, system_message):
-    """Generates text using the loaded LLM."""
+def generate_text_stream(llm_instance: Llama, prompt: str,
+                         history: List[Tuple[str, str | None]],
+                         system_message: str) -> Generator[str, None, None]:
     formatted_prompt = format_prompt_zephyr(prompt, history, system_message)
     stream = llm_instance(
         formatted_prompt,
@@ -154,120 +186,133 @@ def generate_text_stream(llm_instance, prompt, history, system_message):
         stream=True
     )
     for response in stream:
-        char = response["choices"][0]["text"]
-        if "<|user|>" in char or emoji.is_emoji(char):
-            return
-        yield char
+        ch = response["choices"][0]["text"]
+        # Guard against control tokens & isolated emoji artefacts
+        if "<|user|>" in ch or (len(ch) == 1 and emoji.is_emoji(ch)):
+            continue
+        yield ch
 
-def generate_audio_stream(tts_instance, text, language, latents):
-    """Generates audio using the loaded TTS model."""
+def generate_audio_stream(tts_instance: Xtts, text: str, language: str,
+                          latents: Tuple[np.ndarray, np.ndarray]) -> Generator[bytes, None, None]:
     gpt_cond_latent, speaker_embedding = latents
     try:
-        chunks = tts_instance.inference_stream(
-            text,
-            language,
-            gpt_cond_latent,
-            speaker_embedding,
+        for chunk in tts_instance.inference_stream(
+            text=text,
+            language=language,
+            gpt_cond_latent=gpt_cond_latent,
+            speaker_embedding=speaker_embedding,
             temperature=0.85,
-        )
-        for chunk in chunks:
+        ):
             if chunk is not None:
                 yield chunk.detach().cpu().numpy().squeeze().tobytes()
     except RuntimeError as e:
         print(f"Error during TTS inference: {e}")
+        # Soft-restart if GPU went bad and we can talk to the HF API
         if "device-side assert" in str(e) and api:
-            gr.Warning("Critical GPU error. Restarting the Space...")
-            api.restart_space(repo_id=repo_id)
+            gr.Warning("Critical GPU error. Attempting to restart the Space...")
+            try:
+                api.restart_space(repo_id=repo_id)
+            except Exception:
+                pass
 
 # ===================================================================================
 # 4. MAIN GRADIO FUNCTION (Decorated for ZeroGPU)
 # ===================================================================================
 
-@spaces.GPU(duration=120) # Request GPU for 120 seconds
-def generate_story_and_speech(secret_token_input, input_text, chatbot_role):
-    """The main function called by the Gradio interface."""
+@spaces.GPU(duration=120)  # Request GPU for 120 seconds
+def generate_story_and_speech(secret_token_input: str, input_text: str, chatbot_role: str) -> List[Dict[str, str]]:
     if secret_token_input != SECRET_TOKEN:
-        raise gr.Error('Invalid secret token provided.')
-
+        raise gr.Error("Invalid secret token provided.")
     if not input_text:
         return []
 
-    # --- Step 1: Load models (will use cache after first run) ---
+    # Load models
     tts, llm = load_models()
-
-    # --- Pre-compute voice latents ---
-    latent_map = {}
-    for role, filename in [("Cloée", "cloee-1.wav"), ("Julian", "julian-bedtime-style-1.wav"),
-                           ("Pirate", "pirate_by_coqui.wav"), ("Thera", "thera-1.wav")]:
+
+    # Pre-compute voice latents
+    latent_map: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
+    for role, filename in [
+        ("Cloée", "cloee-1.wav"),
+        ("Julian", "julian-bedtime-style-1.wav"),
+        ("Pirate", "pirate_by_coqui.wav"),
+        ("Thera", "thera-1.wav"),
+    ]:
        path = os.path.join("voices", filename)
-        latent_map[role] = tts.get_conditioning_latents(audio_path=path, gpt_cond_len=30, max_ref_length=60)
-
-    # --- Step 2: Generate the full story text ---
-    history = [[input_text, None]]
+        latent_map[role] = tts.get_conditioning_latents(
+            audio_path=path, gpt_cond_len=30, max_ref_length=60
+        )
+
+    # Generate story text
+    history: List[Tuple[str, str | None]] = [(input_text, None)]
     full_story_text = "".join(
         generate_text_stream(llm, history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role])
-    )
-
-    # --- Step 3: Post-process text and generate audio sentence by sentence ---
-    full_story_text = re.sub(r"([^\x00-\x7F]|\w)([.?!]+)", r"\1 \2", full_story_text.strip())
+    ).strip()
+
     if not full_story_text:
         return []
 
+    # Tokenize into shorter sentences for TTS
    sentences = split_sentences(full_story_text, SENTENCE_SPLIT_LENGTH)
-    lang = langid.classify(sentences[0])[0] if sentences else 'en'
-
-    results = []
+    lang = langid.classify(sentences[0])[0] if sentences else "en"
+
+    results: List[Dict[str, str]] = []
    for sentence in sentences:
        if not any(c.isalnum() for c in sentence):
            continue
 
        audio_chunks = generate_audio_stream(tts, sentence, lang, latent_map[chatbot_role])
-        if audio_chunks:
-            pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
-
-            # Optional: Noise reduction
-            try:
-                data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
+        pcm_data = b"".join(chunk for chunk in audio_chunks if chunk)
+
+        # Optional noise reduction (best-effort)
+        try:
+            data_s16 = np.frombuffer(pcm_data, dtype=np.int16)
+            if data_s16.size > 0:
                float_data = data_s16.astype(np.float32) / 32767.0
-                reduced_noise = nr.reduce_noise(y=float_data, sr=24000)
-                final_pcm = (reduced_noise * 32767).astype(np.int16).tobytes()
-            except Exception:
+                reduced = nr.reduce_noise(y=float_data, sr=24000)
+                final_pcm = (reduced * 32767).astype(np.int16).tobytes()
+            else:
                final_pcm = pcm_data
-
-            base64_audio = base64.b64encode(pcm_to_wav(final_pcm)).decode('utf-8')
-            results.append({"text": sentence, "audio": base64_audio})
-
+        except Exception:
+            final_pcm = pcm_data
+
+        b64_wav = base64.b64encode(pcm_to_wav(final_pcm)).decode("utf-8")
+        results.append({"text": sentence, "audio": b64_wav})
+
    return results
 
 # ===================================================================================
 # 5. GRADIO INTERFACE LAUNCH
 # ===================================================================================
 
-# --- Download voice files on startup ---
+# Download voice files on startup
 print("Downloading voice files...")
-file_names = ['cloee-1.wav', 'julian-bedtime-style-1.wav', 'pirate_by_coqui.wav', 'thera-1.wav']
-base_url = 'https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/'
-os.makedirs('voices', exist_ok=True)
+file_names = ["cloee-1.wav", "julian-bedtime-style-1.wav", "pirate_by_coqui.wav", "thera-1.wav"]
+base_url = "https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/"
+os.makedirs("voices", exist_ok=True)
 for name in file_names:
-    if not os.path.exists(os.path.join('voices', name)):
-        response = requests.get(base_url + name)
-        with open(os.path.join('voices', name), 'wb') as f:
-            f.write(response.content)
+    dst = os.path.join("voices", name)
+    if not os.path.exists(dst):
+        try:
+            resp = requests.get(base_url + name, timeout=30)
+            resp.raise_for_status()
+            with open(dst, "wb") as f:
+                f.write(resp.content)
+        except Exception as e:
+            print(f"Failed to download {name}: {e}")
 
-# --- Define the Gradio Interface ---
+# Define the Gradio Interface
 demo = gr.Interface(
    fn=generate_story_and_speech,
    inputs=[
-        gr.Text(label='Secret Token', type='password', value=SECRET_TOKEN),
+        gr.Textbox(label="Secret Token", type="password", value=SECRET_TOKEN),
        gr.Textbox(placeholder="What should the story be about?", label="Story Prompt"),
-        gr.Dropdown(choices=ROLES, label="Select a Storyteller", value="Cloée")
+        gr.Dropdown(choices=ROLES, label="Select a Storyteller", value="Cloée"),
    ],
    outputs=gr.JSON(label="Story and Audio Output"),
    title="AI Storyteller with ZeroGPU",
    description="Enter a prompt to generate a short story with voice narration using on-demand GPU.",
-    allow_flagging="never"
+    allow_flagging="never",
 )
 
-# --- Launch the App ---
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue().launch()
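Note on consuming the new output: generate_story_and_speech now returns a JSON list of {"text", "audio"} pairs, where "audio" is a base64-encoded WAV. A minimal client-side sketch follows; the save_story_chunk helper is hypothetical (not part of this commit) and assumes only the Python stdlib plus the 24 kHz, mono, 16-bit header written by pcm_to_wav.

    import base64
    import io
    import wave

    def save_story_chunk(entry: dict, path: str) -> None:
        # entry is one item of the JSON output: {"text": ..., "audio": <base64 WAV>}
        wav_bytes = base64.b64decode(entry["audio"])
        with open(path, "wb") as f:
            f.write(wav_bytes)
        # Sanity-check the RIFF header produced by pcm_to_wav (24 kHz, mono, 16-bit)
        with wave.open(io.BytesIO(wav_bytes)) as w:
            assert w.getframerate() == 24000
            assert w.getnchannels() == 1
            assert w.getsampwidth() == 2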
requirements.txt CHANGED
@@ -2,8 +2,11 @@
 torch==2.2.2
 torchaudio==2.2.2
 gradio==5.47.2
-huggingface-hub
+huggingface-hub>=0.19
 python-dotenv
+spaces
+requests
+numpy
 
 # TTS Dependencies
 TTS @ git+https://github.com/coqui-ai/TTS@v0.22.0
@@ -13,7 +16,7 @@ pydantic==2.5.3
 llama-cpp-python==0.2.79
 
 # Audio & Text Processing
-noisereduce==3.0.1
+noisereduce==3.0.3
 pydub
 langid
 nltk
@@ -22,4 +25,4 @@ ffmpeg-python
 
 # Japanese Text (if needed by TTS)
 mecab-python3==1.0.9
-unidic-lite==1.0.8
+unidic-lite==1.0.8
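For reference, the new bot_response guard in format_prompt_zephyr means unanswered history turns no longer inject a literal "None" reply into the prompt. A small illustration with made-up inputs (not from the commit):

    # Hypothetical call; the history pair with no bot reply is now skipped.
    prompt = format_prompt_zephyr(
        message="Tell me a story about a brave squirrel.",
        history=[("Hi there!", None)],
        system_message="You're a storyteller.",
    )
    # Resulting prompt:
    # <|system|>
    # You're a storyteller.</s><|user|>
    # Tell me a story about a brave squirrel.</s><|assistant|>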