MariaKaiser committed on
Commit
cf9540e
·
verified ·
1 Parent(s): fe0e5ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -29
app.py CHANGED
@@ -5,6 +5,7 @@ import torchaudio
5
  import os
6
  from pydantic import BaseModel
7
  from typing import List, Optional
 
8
 
9
 
10
  # MODEL_DIR = "my_model"
@@ -140,7 +141,7 @@ def inference_by_model(text: str, audio_file: str, save_path: str) -> str:
140
 
141
  #_______________generate audios and folder structure_______________________
142
 
143
- async def generate_story_audios(story: StoryCreationDTO, base_output: str = "stories"):
144
  """
145
  Generates audio files and folders for the entire story
146
  """
@@ -182,7 +183,7 @@ async def generate_story_audios(story: StoryCreationDTO, base_output: str = "sto
182
  from pydub import AudioSegment
183
  import asyncio
184
 
185
- async def concat_story_audio(story: StoryCreationDTO, base_output="stories", final_path: str = None,): # full path including filename
186
  story_dir = Path(base_output) / story.storyId
187
  story_dir.mkdir(parents=True, exist_ok=True)
188
 
@@ -275,28 +276,10 @@ def root():
275
 
276
  #-----------------------------------------------------------
277
 
278
-
279
- ########## creating a dummy audio file
280
- import torchaudio
281
- import torch
282
- import os
283
-
284
- OUTPUT_DIR = "outputs"
285
- os.makedirs(OUTPUT_DIR, exist_ok=True)
286
-
287
- dummy_path = os.path.join(OUTPUT_DIR, "dummy.wav")
288
-
289
- # Generate 1 second of silence at 24kHz
290
- if not os.path.exists(dummy_path):
291
- silent = torch.zeros(1, 24000) # 1 channel, 24000 samples
292
- torchaudio.save(dummy_path, silent, 24000)
293
-
294
- from pydantic import BaseModel
295
-
296
  class TTSResponse(BaseModel):
297
  file_name: str
298
  duration: float # seconds
299
- audio_base64: str
300
 
301
  ######## Convert your audio to Base64
302
  import base64
@@ -318,32 +301,34 @@ def audio_to_base64(audio_path: str) -> (str, float):
318
 
319
  @app.post("/tts/")
320
  async def process_story(story: StoryCreationDTO):
 
 
321
  print(story.storyId)
322
-
323
  for cast in story.cast:
324
  print(cast.name, cast.voiceReference)
325
-
326
  for chapter in story.chapters:
327
  for scene in chapter.scenes:
328
  for sentence in scene.sentences:
329
  print(sentence.speaker, sentence.sentence)
330
 
331
- # For testing, use your dummy WAV
332
- dummy_path = os.path.join(OUTPUT_DIR, "dummy.wav")
 
 
 
 
333
 
334
  # Convert to base64 and get duration
335
- audio_b64, duration = audio_to_base64(dummy_path)
336
 
337
  response = TTSResponse(
338
- file_name="chapter1_scene2.wav",
339
  duration=duration,
340
  audio_base64=audio_b64
341
  )
342
 
343
  return response
344
 
345
- #return {"status": "Story received"}
346
-
347
 
348
 
349
  # async def tts_endpoint(
 
5
  import os
6
  from pydantic import BaseModel
7
  from typing import List, Optional
8
+ from pathlib import Path
9
 
10
 
11
  # MODEL_DIR = "my_model"
 
141
 
142
  #_______________generate audios and folder structure_______________________
143
 
144
+ async def generate_story_audios(story: StoryCreationDTO, base_output: str):
145
  """
146
  Generates audio files and folders for the entire story
147
  """
 
183
  from pydub import AudioSegment
184
  import asyncio
185
 
186
+ async def concat_story_audio(story: StoryCreationDTO, base_output: str, final_path: str = None): # full path including filename
187
  story_dir = Path(base_output) / story.storyId
188
  story_dir.mkdir(parents=True, exist_ok=True)
189
 
 
276
 
277
  #-----------------------------------------------------------
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  class TTSResponse(BaseModel):
280
  file_name: str
281
  duration: float # seconds
282
+ audio_base64: str
283
 
284
  ######## Convert your audio to Base64
285
  import base64
 
301
 
302
  @app.post("/tts/")
303
  async def process_story(story: StoryCreationDTO):
304
+
305
+ # Optional: print info for debugging
306
  print(story.storyId)
 
307
  for cast in story.cast:
308
  print(cast.name, cast.voiceReference)
 
309
  for chapter in story.chapters:
310
  for scene in chapter.scenes:
311
  for sentence in scene.sentences:
312
  print(sentence.speaker, sentence.sentence)
313
 
314
+ # 1️⃣ Generate all sentence audios and folder structure
315
+ await generate_story_audios(story, base_output=OUTPUT_DIR)
316
+
317
+ # 2️⃣ Concatenate all into final story audio
318
+ final_story_path = os.path.join(OUTPUT_DIR, story.storyId, f"{story.storyId}_full.wav")
319
+ final_generated_story_path = await concat_story_audio(story, base_output=OUTPUT_DIR, final_path=final_story_path)
320
 
321
  # Convert to base64 and get duration
322
+ audio_b64, duration = audio_to_base64(final_generated_story_path)
323
 
324
  response = TTSResponse(
325
+ file_name= os.path.basename(final_generated_story_path),
326
  duration=duration,
327
  audio_base64=audio_b64
328
  )
329
 
330
  return response
331
 
 
 
332
 
333
 
334
  # async def tts_endpoint(