Spaces:

Nick021402
/

PodXplain

Sleeping

App Files Community

Nick021402 committed on May 24, 2025

Commit

daae1c2

verified ·

1 Parent(s): 3491ff8

Update tts_engine.py

Browse files

Files changed (1) hide show

tts_engine.py +10 -29

tts_engine.py CHANGED Viewed

@@ -45,13 +45,13 @@ class NariDIAEngine:
             logger.info("Nari DIA model initialized successfully.")
         except Exception as e:
-            logger.error(f"Failed to initialize Nari DIA model: {e}")
             self.model = None
     def synthesize_segment(
         self,
         text: str,
-        speaker: str, # This will be 'speaker1', 'speaker2' from segmenter
         output_path: str
     ) -> Optional[str]:
         """
@@ -59,7 +59,7 @@ class NariDIAEngine:
         Args:
             text: Text to synthesize
-            speaker: Speaker identifier (e.g., 'speaker1', 'speaker2')
             output_path: Path to save the audio file
         Returns:
@@ -71,24 +71,14 @@ class NariDIAEngine:
         try:
             # Nari DIA expects [S1] or [S2] tags.
-            # Map your generic 'speaker1'/'speaker2' to Nari DIA's tags.
-            # Note: Nari DIA primarily supports S1 and S2. If your segmenter
-            # generates more speakers, they will be mapped to S1/S2.
-            # Map based on the speaker index derived from segmenter
-            if speaker == "speaker1":
-                dia_speaker_tag = "[S1]"
-            elif speaker == "speaker2":
-                dia_speaker_tag = "[S2]"
             else:
-                # For speaker3, speaker4, etc., we'll alternate or default to S1/S2
-                # A more sophisticated mapping might be needed if you want more than 2 distinct voices
-                # For simplicity, we'll just alternate beyond speaker2
-                if int(speaker.replace('speaker', '')) % 2 == 1: # Odd speakers to S1
-                    dia_speaker_tag = "[S1]"
-                else: # Even speakers to S2
-                    dia_speaker_tag = "[S2]"
-                logger.warning(f"Nari DIA primarily supports [S1] and [S2]. Mapping '{speaker}' to '{dia_speaker_tag}'.")
             # Nari DIA expects the speaker tag at the beginning of the segment
             full_text_input = f"{dia_speaker_tag} {text}"
@@ -102,7 +92,6 @@ class NariDIAEngine:
                 inputs = {k: v.to("cuda") for k, v in inputs.items()}
             with torch.no_grad():
-                # The .generate method should return audio waveform
                 audio_waveform = self.model.generate(**inputs).cpu().numpy().squeeze()
             # Nari DIA's sampling rate (check documentation if different)
@@ -119,11 +108,3 @@ class NariDIAEngine:
         except Exception as e:
             logger.error(f"Failed to synthesize segment with Nari DIA: {e}", exc_info=True) # exc_info to print full traceback
             return None
-    # Remove the mock audio generation function as it's no longer needed
-    # def _generate_mock_audio(self, text: str, speaker: str) -> np.ndarray:
-    #     """
-    #     Generate mock audio data for demonstration.
-    #     In a real implementation, this would be replaced with actual TTS.
-    #     """
-    #     # ... (existing mock audio generation code) ...

             logger.info("Nari DIA model initialized successfully.")
         except Exception as e:
+            logger.error(f"Failed to initialize Nari DIA model: {e}", exc_info=True)
             self.model = None
     def synthesize_segment(
         self,
         text: str,
+        speaker: str, # This will be 'S1' or 'S2' from segmenter
         output_path: str
     ) -> Optional[str]:
         """
         Args:
             text: Text to synthesize
+            speaker: Speaker identifier ('S1' or 'S2' expected from segmenter)
             output_path: Path to save the audio file
         Returns:
         try:
             # Nari DIA expects [S1] or [S2] tags.
+            # The segmenter is now directly outputting "S1" or "S2".
+            # We just need to wrap it in brackets.
+            if speaker in ["S1", "S2"]:
+                dia_speaker_tag = f"[{speaker}]"
             else:
+                # Fallback in case segmenter outputs something unexpected
+                logger.warning(f"Unexpected speaker tag '{speaker}' from segmenter. Defaulting to [S1].")
+                dia_speaker_tag = "[S1]"
             # Nari DIA expects the speaker tag at the beginning of the segment
             full_text_input = f"{dia_speaker_tag} {text}"
                 inputs = {k: v.to("cuda") for k, v in inputs.items()}
             with torch.no_grad():
                 audio_waveform = self.model.generate(**inputs).cpu().numpy().squeeze()
             # Nari DIA's sampling rate (check documentation if different)
         except Exception as e:
             logger.error(f"Failed to synthesize segment with Nari DIA: {e}", exc_info=True) # exc_info to print full traceback
             return None