Spaces:

gnosticdev
/

Podcastking2

Sleeping

App Files Files Community

gnosticdev committed on Jun 20, 2025

Commit

64bc311

verified ·

1 Parent(s): 00b6284

Update conver.py

Browse files

Files changed (1) hide show

conver.py +48 -95

conver.py CHANGED Viewed

@@ -9,7 +9,6 @@ import tempfile
 from pydub import AudioSegment
 import base64
 from pathlib import Path
-import numpy as np
 @dataclass
 class ConversationConfig:
@@ -24,7 +23,6 @@ class URLToAudioConverter:
         self.llm_out = None
     def fetch_text(self, url: str) -> str:
-        """Obtiene texto desde una URL"""
         if not url:
             raise ValueError("URL cannot be empty")
         full_url = f"{self.config.prefix_url}{url}"
@@ -36,7 +34,6 @@ class URLToAudioConverter:
             raise RuntimeError(f"Failed to fetch URL: {e}")
     def extract_conversation(self, text: str) -> Dict:
-        """Convierte texto plano a estructura de conversación"""
         if not text:
             raise ValueError("Input text cannot be empty")
         try:
@@ -56,7 +53,6 @@ class URLToAudioConverter:
             raise RuntimeError(f"Failed to extract conversation: {str(e)}")
     async def text_to_speech(self, conversation_json: Dict, voice_1: str, voice_2: str) -> Tuple[List[str], str]:
-        """Convierte JSON de conversación a archivos de audio"""
         output_dir = Path(self._create_output_directory())
         filenames = []
         try:
@@ -71,10 +67,8 @@ class URLToAudioConverter:
             raise RuntimeError(f"Text-to-speech failed: {e}")
     async def _generate_audio(self, text: str, voice: str) -> str:
-        """Genera audio temporal con edge-tts"""
         if not text.strip():
             raise ValueError("Text cannot be empty")
         communicate = edge_tts.Communicate(
             text,
             voice.split(" - ")[0],
@@ -86,123 +80,82 @@ class URLToAudioConverter:
             return tmp_file.name
     def _create_output_directory(self) -> str:
-        """Crea directorio único para los archivos"""
         folder_name = base64.urlsafe_b64encode(os.urandom(8)).decode("utf-8")
         os.makedirs(folder_name, exist_ok=True)
         return folder_name
     def combine_audio_files(self, filenames: List[str]) -> AudioSegment:
-        """Combina segmentos de audio"""
         if not filenames:
             raise ValueError("No audio files provided")
         combined = AudioSegment.empty()
         for filename in filenames:
             combined += AudioSegment.from_file(filename, format="mp3")
         return combined
-    def _detect_silences(self, audio: AudioSegment, min_len: int = 500, thresh: int = -40) -> List[Tuple[int, int]]:
-        """Detecta intervalos de silencio en el audio"""
-        silent_ranges = []
-        start = None
-        samples = np.array(audio.get_array_of_samples())
-        window_size = int(min_len * audio.frame_rate / 1000)
-        for i in range(0, len(samples) - window_size, window_size):
-            window = samples[i:i+window_size]
-            if np.max(window) < thresh:
-                if start is None:
-                    start = i
-            else:
-                if start is not None:
-                    silent_ranges.append((start, i))
-                    start = None
-        return silent_ranges
     def add_background_music_and_tags(
         self,
         speech_audio: AudioSegment,
         music_path: str,
         tags_paths: List[str]
     ) -> AudioSegment:
-        """Mezcla música de fondo y tags inteligentemente"""
-        # 1. Cargar y ajustar música
-        music = AudioSegment.from_file(music_path).fade_out(2000)
-        music = music - 25  # Reducir volumen
-        # 2. Loop inteligente (solo si es necesario)
         if len(music) < len(speech_audio):
-            loops = (len(speech_audio) // len(music)) + 1
-            music = music * loops
         music = music[:len(speech_audio)]
-        # 3. Mezclar voz y música
-        mixed = speech_audio.overlay(music, position=0)
-        # 4. Insertar tags
         tag_intro = AudioSegment.from_file(tags_paths[0]) - 10
-        tag_transition = AudioSegment.from_file(tags_paths[1]) - 10
-        # Tag inicial
         final_audio = tag_intro + mixed
-        # Tags en pausas (opcional)
-        silences = self._detect_silences(speech_audio)
-        for start, end in reversed(silences):
-            if (end - start) > len(tag_transition):
-                final_audio = final_audio.overlay(
-                    tag_transition,
-                    position=start + 100  # Pequeño margen
-                )
         return final_audio
-    async def process_content(
         self,
-        content: str,
         voice_1: str,
-        voice_2: str,
-        is_url: bool = False
     ) -> Tuple[str, str]:
-        """Procesa contenido (URL o texto) a audio final"""
-        try:
-            # 1. Obtener texto estructurado
-            if is_url:
-                text = self.fetch_text(content)
-                if len(words := text.split()) > self.config.max_words:
-                    text = " ".join(words[:self.config.max_words])
-                conversation = self.extract_conversation(text)
-            else:
-                conversation = self.extract_conversation(content)
-            # 2. Generar audio
-            audio_files, folder_name = await self.text_to_speech(conversation, voice_1, voice_2)
-            combined = self.combine_audio_files(audio_files)
-            # 3. Mezclar con música y tags
-            final_audio = self.add_background_music_and_tags(
-                combined,
-                "musica.mp3",
-                ["tag.mp3", "tag2.mp3"]
-            )
-            # 4. Exportar
-            output_path = os.path.join(folder_name, "podcast_final.mp3")
-            final_audio.export(output_path, format="mp3")
-            # 5. Limpieza
-            for f in audio_files:
-                os.remove(f)
-            # Texto de conversación
-            conversation_text = "\n".join(
-                f"{turn['speaker']}: {turn['text']}"
-                for turn in conversation["conversation"]
-            )
-            return output_path, conversation_text
-        except Exception as e:
-            raise RuntimeError(f"Processing failed: {str(e)}")

 from pydub import AudioSegment
 import base64
 from pathlib import Path
 @dataclass
 class ConversationConfig:
         self.llm_out = None
     def fetch_text(self, url: str) -> str:
         if not url:
             raise ValueError("URL cannot be empty")
         full_url = f"{self.config.prefix_url}{url}"
             raise RuntimeError(f"Failed to fetch URL: {e}")
     def extract_conversation(self, text: str) -> Dict:
         if not text:
             raise ValueError("Input text cannot be empty")
         try:
             raise RuntimeError(f"Failed to extract conversation: {str(e)}")
     async def text_to_speech(self, conversation_json: Dict, voice_1: str, voice_2: str) -> Tuple[List[str], str]:
         output_dir = Path(self._create_output_directory())
         filenames = []
         try:
             raise RuntimeError(f"Text-to-speech failed: {e}")
     async def _generate_audio(self, text: str, voice: str) -> str:
         if not text.strip():
             raise ValueError("Text cannot be empty")
         communicate = edge_tts.Communicate(
             text,
             voice.split(" - ")[0],
             return tmp_file.name
     def _create_output_directory(self) -> str:
         """Create a randomly named output directory and return its name.

         The name is the URL-safe base64 encoding of 8 random bytes. It is a
         relative path, so the directory is created relative to the current
         working directory.

         Returns:
             The name of the directory that was created.
         """
         random_bytes = os.urandom(8)
         directory = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
         os.makedirs(directory, exist_ok=True)
         return directory
     def combine_audio_files(self, filenames: List[str]) -> AudioSegment:
         """Concatenate several MP3 files into one audio segment.

         Args:
             filenames: Paths of the MP3 files, in playback order.

         Returns:
             A single segment holding every file back to back.

         Raises:
             ValueError: If ``filenames`` is empty.
         """
         if not filenames:
             raise ValueError("No audio files provided")
         result = AudioSegment.empty()
         for path in filenames:
             segment = AudioSegment.from_file(path, format="mp3")
             result += segment
         return result
     def add_background_music_and_tags(
         self,
         speech_audio: AudioSegment,
         music_path: str,
         tags_paths: List[str]
     ) -> AudioSegment:
         """Mix background music under the speech and add the audio tags.

         The music is faded out over its last 2000 ms, attenuated by 25 dB,
         looped if it is shorter than the speech and trimmed to the speech
         length. The intro tag is prepended to the mix, and the transition tag
         is overlaid once inside every pause of the speech that is long enough
         to hold it.

         Args:
             speech_audio: The spoken audio to decorate.
             music_path: Path of the background music file.
             tags_paths: Paths of the tag files; index 0 is the intro tag and
                 index 1 is the transition tag.

         Returns:
             The intro tag followed by the speech/music mix, with transition
             tags overlaid on the qualifying pauses.

         Raises:
             ValueError: If the music has to be looped but holds no audio.
             IndexError: If ``tags_paths`` has fewer than two entries.
         """
         window_ms = 500      # length of each analysed slice of speech
         step_ms = 100        # distance between consecutive slice starts
         silence_dbfs = -40   # slices quieter than this count as silence
         tag_margin_ms = 50   # delay between pause start and tag start

         music = AudioSegment.from_file(music_path).fade_out(2000) - 25
         if len(music) < len(speech_audio):
             if len(music) == 0:
                 # Looping an empty segment would divide by zero below.
                 raise ValueError("Background music file contains no audio")
             music = music * ((len(speech_audio) // len(music)) + 1)
         music = music[:len(speech_audio)]
         mixed = speech_audio.overlay(music)

         tag_intro = AudioSegment.from_file(tags_paths[0]) - 10
         tag_trans = AudioSegment.from_file(tags_paths[1]) - 10
         final_audio = tag_intro + mixed

         # Slide a window over the speech and merge overlapping silent windows
         # into contiguous pauses, so each pause is measured at its real length
         # and receives at most one tag.
         pauses = []
         for start in range(0, len(speech_audio) - window_ms, step_ms):
             if speech_audio[start:start + window_ms].dBFS >= silence_dbfs:
                 continue
             end = start + window_ms
             if pauses and start <= pauses[-1][1]:
                 pauses[-1] = (pauses[-1][0], end)
             else:
                 pauses.append((start, end))

         # Pause offsets are measured on the speech, but final_audio starts
         # with the intro tag, so shift every position by the intro length.
         intro_len = len(tag_intro)
         for start, end in pauses:
             if (end - start) >= len(tag_trans):
                 final_audio = final_audio.overlay(
                     tag_trans,
                     position=intro_len + start + tag_margin_ms
                 )
         return final_audio
+    async def url_to_audio(self, url: str, voice_1: str, voice_2: str) -> Tuple[str, str]:
+        text = self.fetch_text(url)
+        if len(words := text.split()) > self.config.max_words:
+            text = " ".join(words[:self.config.max_words])
+        conversation = self.extract_conversation(text)
+        return await self._process_to_audio(conversation, voice_1, voice_2)
+    async def text_to_audio(self, text: str, voice_1: str, voice_2: str) -> Tuple[str, str]:
+        conversation = self.extract_conversation(text)
+        return await self._process_to_audio(conversation, voice_1, voice_2)
+    async def raw_text_to_audio(self, text: str, voice_1: str, voice_2: str) -> Tuple[str, str]:
+        conversation = {"conversation": [{"speaker": "Narrator", "text": text}]}
+        return await self._process_to_audio(conversation, voice_1, voice_2)
+    async def _process_to_audio(
         self,
+        conversation: Dict,
         voice_1: str,
+        voice_2: str
     ) -> Tuple[str, str]:
+        audio_files, folder_name = await self.text_to_speech(conversation, voice_1, voice_2)
+        combined = self.combine_audio_files(audio_files)
+        final_audio = self.add_background_music_and_tags(
+            combined,
+            "musica.mp3",
+            ["tag.mp3", "tag2.mp3"]
+        )
+        output_path = os.path.join(folder_name, "output.mp3")
+        final_audio.export(output_path, format="mp3")
+        for f in audio_files:
+            os.remove(f)
+        text_output = "\n".join(
+            f"{turn['speaker']}: {turn['text']}"
+            for turn in conversation["conversation"]
+        )
+        return output_path, text_output