Update app.py

app.py (CHANGED)
@@ -299,41 +299,81 @@ async def text_to_speech(
     output_format: str = Form("wav", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
-
+    MAXIMUM SPEED TTS endpoint with full parallel processing.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")

     start_time = time.time()
     try:
-        #
-        converted_wav_buffer = await
+        # ✅ PARALLEL STEP 1: Convert audio AND split text concurrently
+        converted_wav_buffer, sentences = await asyncio.gather(
+            convert_to_wav_in_memory(reference_audio),
+            asyncio.get_event_loop().run_in_executor(
+                tts_executor,
+                app.state.tts_wrapper._split_text_into_chunks,
+                text
+            )
+        )
+
         ref_audio_bytes = converted_wav_buffer.getvalue()
-
-
-
-
-
-
-
+        audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()
+        logger.info(f"🚀 MAX PARALLEL: Processing {len(sentences)} chunks")
+
+        # ✅ PARALLEL STEP 2: Get reference encoding
+        ref_s = await asyncio.get_event_loop().run_in_executor(
+            tts_executor,
+            app.state.tts_wrapper._get_or_create_reference_encoding,
+            audio_hash,
+            ref_audio_bytes
         )
-
-        # 3
-
-
-
-
+
+        # ✅ MAX PARALLEL STEP 3: Process ALL chunks simultaneously
+        loop = asyncio.get_event_loop()
+
+        def process_single_chunk(sentence_text):
+            with torch.no_grad():
+                return app.state.tts_wrapper.tts_model.infer(sentence_text, ref_s, reference_text)
+
+        # Schedule ALL chunks in parallel (limited by MAX_WORKERS)
+        tasks = []
+        for sentence in sentences:
+            task = loop.run_in_executor(tts_executor, process_single_chunk, sentence)
+            tasks.append(task)
+
+        # Wait for ALL chunks to complete
+        chunk_audios = await asyncio.gather(*tasks)
+
+        # ✅ Combine all audio chunks (fast numpy concatenation)
+        combined_audio = np.concatenate(chunk_audios) if chunk_audios else np.array([])
+
+        # ✅ PARALLEL STEP 4: Convert format while calculating stats
+        audio_bytes, audio_duration = await asyncio.gather(
+            asyncio.get_event_loop().run_in_executor(
+                tts_executor,
+                app.state.tts_wrapper._convert_to_streamable_format,
+                combined_audio,
+                output_format
+            ),
+            asyncio.get_event_loop().run_in_executor(
+                tts_executor,
+                lambda: len(combined_audio) / SAMPLE_RATE
+            )
+        )

         processing_time = time.time() - start_time
-
+
+        logger.info(f"✅ MAX SPEED Synthesis: {processing_time:.2f}s for {audio_duration:.2f}s audio ({len(sentences)} chunks)")
+
         return Response(
             content=audio_bytes,
             media_type=f"audio/{'mpeg' if output_format == 'mp3' else output_format}",
             headers={
                 "Content-Disposition": f"attachment; filename=tts_output.{output_format}",
                 "X-Processing-Time": f"{processing_time:.2f}s",
-                "X-Audio-Duration": f"{audio_duration:.2f}s"
+                "X-Audio-Duration": f"{audio_duration:.2f}s",
+                "X-Parallel-Chunks": str(len(sentences)),
+                "X-Speed-Ratio": f"{audio_duration/processing_time:.2f}x"  # Real-time factor
            }
        )
    except Exception as e:
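Both endpoints in this commit lean on the same trick: blocking model calls are pushed onto a thread pool with loop.run_in_executor and awaited together via asyncio.gather, so the event loop stays free while chunks render in parallel. Below is a minimal runnable sketch of that fan-out pattern; the pool, render_chunk, and the sentence list are illustrative stand-ins, not the app's real tts_executor or model calls. (The sketch uses asyncio.get_running_loop(), the modern spelling of the get_event_loop() calls in the hunk.)

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

# Illustrative stand-ins: the app's real pool is tts_executor and the real
# work is app.state.tts_wrapper.tts_model.infer(...).
executor = ThreadPoolExecutor(max_workers=2)

def render_chunk(sentence: str) -> bytes:
    time.sleep(0.1)  # stands in for blocking model inference
    return sentence.encode()

async def synthesize_all(sentences: list[str]) -> bytes:
    loop = asyncio.get_running_loop()
    # Fan out: schedule every chunk on the pool up front, without awaiting.
    tasks = [loop.run_in_executor(executor, render_chunk, s) for s in sentences]
    # gather() returns results in argument order, not completion order,
    # which is what makes the diff's np.concatenate step order-safe.
    chunks = await asyncio.gather(*tasks)
    return b" ".join(chunks)

print(asyncio.run(synthesize_all(["Hello.", "World."])))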
@@ -341,7 +381,6 @@ async def text_to_speech(
         if isinstance(e, HTTPException):
             raise
         raise HTTPException(status_code=500, detail=f"Synthesis failed: {e}")
-
 @app.post("/synthesize/stream")
 async def stream_text_to_speech_cloning(
     text: str = Form(..., min_length=1, max_length=5000),
@@ -350,15 +389,16 @@ async def stream_text_to_speech_cloning(
     output_format: str = Form("mp3", pattern="^(wav|mp3|flac)$"),
     reference_audio: UploadFile = File(...)):
     """
-
-    producer-consumer pipeline.
+    TRUE Real-Time Streaming with 2 workers: Optimized for continuous audio.
     """
     if not hasattr(app.state, 'tts_wrapper'):
         raise HTTPException(status_code=503, detail="Service unavailable: Model not loaded")

     async def stream_generator():
         loop = asyncio.get_event_loop()
-
+
+        # ✅ Perfect queue size for 2 workers
+        q = asyncio.Queue(maxsize=3)  # Store 3 ready chunks for smooth streaming

         async def producer():
             try:
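The bounded queue added here is what gives the producer backpressure: with maxsize=3, await q.put(...) suspends the producer once three chunk futures are already waiting, so synthesis can run at most a few chunks ahead of the client. A toy demonstration of that behavior (all names illustrative):

import asyncio

async def demo() -> None:
    q: asyncio.Queue = asyncio.Queue(maxsize=3)

    async def producer() -> None:
        for i in range(6):
            await q.put(i)  # suspends here whenever 3 items are already queued
            print(f"queued {i}")
        await q.put(None)   # sentinel, mirroring the diff's end-of-stream signal

    prod = asyncio.create_task(producer())
    while (item := await q.get()) is not None:
        print(f"consumed {item}")
        await asyncio.sleep(0.05)  # stands in for sending bytes to the client
    await prod

asyncio.run(demo())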
@@ -366,7 +406,7 @@ async def stream_text_to_speech_cloning(
                 ref_audio_bytes = converted_wav_buffer.getvalue()
                 audio_hash = hashlib.sha256(ref_audio_bytes).hexdigest()

-                #
+                # Get reference encoding (uses 1 worker temporarily)
                 ref_s = await loop.run_in_executor(
                     tts_executor,
                     app.state.tts_wrapper._get_or_create_reference_encoding,
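_get_or_create_reference_encoding is keyed on the sha256 of the uploaded reference audio, but its body is outside this diff. A hash-keyed cache along the following lines is the usual shape; this is a hypothetical sketch, not the app's actual implementation, and _encode_reference is a stand-in for the real embedding call:

import hashlib

_ref_cache: dict[str, bytes] = {}

def _encode_reference(audio_bytes: bytes) -> bytes:
    # Hypothetical stand-in for the expensive model call that builds the
    # style embedding from the reference clip.
    return audio_bytes[:16]

def get_or_create_reference_encoding(audio_hash: str, audio_bytes: bytes) -> bytes:
    # Repeat uploads of the same clip hash to the same key, so the
    # expensive encoding runs only once per distinct reference audio.
    if audio_hash not in _ref_cache:
        _ref_cache[audio_hash] = _encode_reference(audio_bytes)
    return _ref_cache[audio_hash]

clip = b"fake reference audio"
key = hashlib.sha256(clip).hexdigest()
assert get_or_create_reference_encoding(key, clip) is get_or_create_reference_encoding(key, clip)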
@@ -375,39 +415,52 @@ async def stream_text_to_speech_cloning(
                 )

                 sentences = app.state.tts_wrapper._split_text_into_chunks(text)
+                logger.info(f"Streaming {len(sentences)} chunks with 2 workers")

                 def process_chunk(sentence_text):
                     with torch.no_grad():
                         audio_chunk = app.state.tts_wrapper.tts_model.infer(sentence_text, ref_s, reference_text)
                         return app.state.tts_wrapper._convert_to_streamable_format(audio_chunk, output_format)

-                #
+                # ✅ SCHEDULE ALL TASKS IMMEDIATELY
                 for sentence in sentences:
                     task = loop.run_in_executor(tts_executor, process_chunk, sentence)
-                    await q.put(task)
-
+                    await q.put(task)  # Queue futures immediately
+
             except Exception as e:
                 logger.error(f"Error in producer task: {e}")
                 await q.put(e)
             finally:
-                await q.put(None)
+                await q.put(None)  # Signal end of tasks

         producer_task = asyncio.create_task(producer())

-        #
-
+        # ✅ EFFICIENT CONSUMER for 2 workers
+        pending_tasks = set()
+        completed_count = 0
+        total_chunks = len(app.state.tts_wrapper._split_text_into_chunks(text))
+
+        while completed_count < total_chunks:
+            # Get next item from queue
            result = await q.get()
-            if result is None:
-                break

            if isinstance(result, Exception):
                logger.error(f"Terminating stream due to producer error: {result}")
                raise result

-
+            if result is None:
+                break  # No more tasks coming
+
+            # ✅ Process this chunk immediately
            chunk_bytes = await result
            yield chunk_bytes
+            completed_count += 1

+            # ✅ Check if we can process next chunk without waiting
+            # This ensures continuous streaming
+            if completed_count < total_chunks and not q.empty():
+                continue  # Immediately process next ready chunk
+
        await producer_task

    return StreamingResponse(
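The shape of this hunk in one piece: the producer enqueues executor futures rather than finished results, so inference on chunk N+1 proceeds while chunk N is being awaited and sent; a None sentinel ends the stream and exceptions are forwarded through the same queue. A compact runnable sketch of the pattern, with a toy render function in place of process_chunk:

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
from typing import AsyncIterator

executor = ThreadPoolExecutor(max_workers=2)

def render(sentence: str) -> bytes:
    time.sleep(0.1)  # stands in for model inference
    return (sentence + "\n").encode()

async def stream(sentences: list[str]) -> AsyncIterator[bytes]:
    loop = asyncio.get_running_loop()
    q: asyncio.Queue = asyncio.Queue(maxsize=3)

    async def producer() -> None:
        try:
            for s in sentences:
                await q.put(loop.run_in_executor(executor, render, s))
        except Exception as e:  # forward failures to the consumer
            await q.put(e)
        finally:
            await q.put(None)   # sentinel: no more futures coming

    task = asyncio.create_task(producer())
    while (item := await q.get()) is not None:
        if isinstance(item, Exception):
            raise item
        yield await item        # futures resolve in submission order
    await task

async def main() -> None:
    async for chunk in stream(["One.", "Two.", "Three."]):
        print(chunk)

asyncio.run(main())

Two leftovers in the committed consumer look worth a follow-up: pending_tasks is assigned but never used, and total_chunks re-runs _split_text_into_chunks(text) outside the producer, so the text is split twice per request; counting until the None sentinel alone would avoid both. The trailing if/continue is also a no-op, since the while loop re-iterates regardless.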
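Finally, a hypothetical client call against the streaming route. The /synthesize/stream path and the text, output_format, and reference_audio fields appear in the diff; the host, port, file names, and the reference_text form field (the hunks use a reference_text variable whose parameter declaration falls outside the captured lines) are assumptions:

import requests

with open("reference.wav", "rb") as ref:
    resp = requests.post(
        "http://localhost:7860/synthesize/stream",
        data={
            "text": "Hello from the streaming endpoint.",
            "output_format": "mp3",
            "reference_text": "Transcript of the reference clip.",  # assumed field
        },
        files={"reference_audio": ref},
        stream=True,  # consume chunks as the server yields them
    )

resp.raise_for_status()
with open("tts_output.mp3", "wb") as out:
    for chunk in resp.iter_content(chunk_size=None):
        out.write(chunk)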