Spaces:

jvaldi
/

epigen-chatbot

Sleeping

App Files Files Community

Jesus del Carmen Valdiviezo committed on Apr 25, 2025

Commit

cc568ab

1 Parent(s): 0160120

clean text for audio

Browse files

Files changed (2) hide show

app.py +89 -18
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -21,7 +21,9 @@ from audio_recorder_streamlit import audio_recorder
 import asyncio
 import edge_tts
 from io import BytesIO
 # Define your knowledge content at the top level of your script
 knowledge_content = """
@@ -521,33 +523,102 @@ def convert_audio_to_text(path):
 #    os.remove(tmp.name)
-# pip install edge-tts
-import asyncio
-import edge_tts
-from io import BytesIO
-import streamlit as st
 def convert_text_to_audio(text: str) -> bytes:
     """
-    Usa edge-tts para generar MP3 en memoria.
     """
-    #voice = "es-MX-JorgeNeural"
     voice = "es-MX-DaliaNeural"
-    communicate = edge_tts.Communicate(text, voice)
-    # Guardar en disco y luego leer, porque save() no admite BytesIO directo
     tmp_path = "temp_edge.mp3"
     loop = asyncio.new_event_loop()
     loop.run_until_complete(communicate.save(tmp_path))
     with open(tmp_path, "rb") as f:
         data = f.read()
-    # cleanup
-    try:
-        import os; os.remove(tmp_path)
-    except:
-        pass
     return data

 import asyncio
 import edge_tts
 from io import BytesIO
+#
+import re
+import emoji
 # Define your knowledge content at the top level of your script
 knowledge_content = """
 #    os.remove(tmp.name)
def remove_emojis(text: str) -> str:
    """Return ``text`` with every emoji removed.

    Delegates to ``emoji.replace_emoji``, substituting the empty string
    for each emoji it finds.
    """
    without_emojis = emoji.replace_emoji(text, replace="")
    return without_emojis
+#def remove_emojis(text: str) -> str:
+#    """
+#    Elimina emojis basándose en rangos Unicode.
+#    """
+#    emoji_pattern = re.compile(
+#        "["
+#        "\U0001F600-\U0001F64F"  # emoticons
+#        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+#        "\U0001F680-\U0001F6FF"  # transport & map symbols
+#        "\U0001F1E0-\U0001F1FF"  # flags
+#        "]",
+#        flags=re.UNICODE
+#    )
+#    return emoji_pattern.sub("", text)
def remove_urls(text: str) -> str:
    """Remove every ``http://`` / ``https://`` URL from ``text``.

    Sentence punctuation glued to the end of a URL (``. , ; : ! ?``) is
    kept, so the surrounding sentence still ends where it did, and the
    spaces or tabs immediately before the URL are dropped so no double
    space is left behind.

    Args:
        text: Arbitrary text, possibly containing URLs.

    Returns:
        The text with the URLs removed.
    """
    # ``\S+?`` is lazy and the lookahead stops it just before any trailing
    # punctuation that is itself followed by whitespace or end of text.
    # Punctuation *inside* a URL (e.g. "a.b/c?d=1") is followed by more
    # URL characters, so it stays part of the match.
    return re.sub(r"[ \t]*https?://\S+?(?=[.,;:!?]*(?:\s|$))", "", text)
def remove_bullets(text: str) -> str:
    """Strip list markers from ``text``.

    A marker is removed when it opens a line (optionally indented with
    spaces or tabs):

    * ``•`` with any following spaces/tabs;
    * ``-``, ``–``, ``—`` or ``*`` only when followed by a space/tab or the
      end of the line, so ``-5 grados`` and ``**bold**`` are left alone.

    Any ``•`` remaining elsewhere in the text is removed as well.

    Args:
        text: Text that may contain bullet-list lines.

    Returns:
        The text without list markers; line breaks and blank lines are
        preserved.
    """
    # Only spaces/tabs are consumed around the marker: using ``\s`` here
    # would also swallow newlines and collapse blank lines before a bullet.
    text = re.sub(r"(?m)^[ \t]*(?:•[ \t]*|[-–—*](?:[ \t]+|$))", "", text)
    # Drop any bullet character left in the middle of a line.
    return text.replace("•", "")
def remove_markdown(text: str) -> str:
    """Unwrap common inline Markdown, keeping only the visible text.

    Handles ``**bold**``, ``*italic*``, `` `code` `` and ``[text](url)``
    links (the link text is kept, the URL is dropped).

    Args:
        text: Text that may contain inline Markdown.

    Returns:
        The text with those Markdown wrappers removed.
    """
    # Bold: **something** -> something
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
    # Italic: *something* -> something. The asterisks must hug the text
    # (no whitespace just inside them); otherwise a list marker such as
    # "* item" or an expression like "2 * 3 * 4" would be paired up and
    # mangled.
    text = re.sub(r"\*(?!\s)(.+?)(?<!\s)\*", r"\1", text)
    # Inline code: `something` -> something
    text = re.sub(r"`([^`]*)`", r"\1", text)
    # Links: [text](url) -> text
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)
    return text
+#def convert_text_to_audio(text: str) -> bytes:
+#    """
+#    Usa edge-tts para generar MP3 en memoria.
+#    """
+#    #voice = "es-MX-JorgeNeural"
+#    voice = "es-MX-DaliaNeural"
+#    communicate = edge_tts.Communicate(text, voice)
+#
+#    # Guardar en disco y luego leer, porque save() no admite BytesIO directo
+#    tmp_path = "temp_edge.mp3"
+#    loop = asyncio.new_event_loop()
+#    loop.run_until_complete(communicate.save(tmp_path))
+#
+#    with open(tmp_path, "rb") as f:
+#        data = f.read()
+#    # cleanup
+#    try:
+#        import os; os.remove(tmp_path)
+#    except:
+#        pass
+#
+#    return data
 def convert_text_to_audio(text: str) -> bytes:
     """Synthesize ``text`` to MP3 bytes with edge-tts.

     The text is first cleaned of Markdown, URLs, emojis and list markers
     so that none of them are read aloud.

     Args:
         text: The text to speak; may contain Markdown, URLs and emojis.

     Returns:
         The MP3 data produced by edge-tts.
     """
     # Local imports: no file-level import of these modules is visible in
     # this part of the file.
     import os
     import tempfile

     # 1) Clean-up chain. Markdown goes first so that [text](url) links are
     #    still intact when they are unwrapped; stripping URLs first would
     #    leave "[text](" behind.
     clean = remove_markdown(text)
     clean = remove_urls(clean)
     clean = remove_emojis(clean)
     clean = remove_bullets(clean)

     # 2) Voice selection and synthesis
     voice = "es-MX-DaliaNeural"
     communicate = edge_tts.Communicate(clean, voice)

     # 3) Save to a temporary file and read it back. A unique file per call
     #    keeps concurrent sessions from overwriting each other's audio.
     fd, tmp_path = tempfile.mkstemp(suffix=".mp3")
     os.close(fd)
     loop = asyncio.new_event_loop()
     try:
         loop.run_until_complete(communicate.save(tmp_path))
         with open(tmp_path, "rb") as f:
             return f.read()
     finally:
         # Release the loop and the temp file even when synthesis fails.
         loop.close()
         os.remove(tmp_path)

requirements.txt CHANGED Viewed

@@ -18,3 +18,4 @@ SpeechRecognition
 gtts
 audio_recorder_streamlit
 edge-tts

 gtts
 audio_recorder_streamlit
 edge-tts
+emoji