Spaces:

build-small-hackathon
/

Sign2Voice

Build error

App Files Files Community

lilblueyes committed 25 days ago

Commit

de2df4e

1 Parent(s): 94bf482

Add TTS MVP

Browse files

Files changed (4) hide show

README.md +7 -7
app.py +158 -0
packages.txt +2 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,13 +1,13 @@
 ---
-title: Tts Test
-emoji: 📈
-colorFrom: indigo
-colorTo: green
 sdk: gradio
-sdk_version: 6.16.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: ASL TTS Test
+emoji: 🗣️
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 app_file: app.py
 pinned: false
 ---
+# ASL TTS Test
+MVP Gradio pour tester la brique TTS avant de brancher le pipeline vidéo ASL.

app.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import os
+import time
+import tempfile
+import gradio as gr
+import soundfile as sf
+import torch
+from qwen_tts import Qwen3TTSModel
+MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+model = None
+def get_model():
+    global model
+    if model is not None:
+        return model
+    if torch.cuda.is_available():
+        model = Qwen3TTSModel.from_pretrained(
+            MODEL_ID,
+            device_map="cuda:0",
+            dtype=torch.bfloat16,
+        )
+    else:
+        model = Qwen3TTSModel.from_pretrained(
+            MODEL_ID,
+            device_map="cpu",
+            dtype=torch.float32,
+        )
+    return model
+def generate_tts(text, language, speaker, instruction):
+    text = (text or "").strip()
+    instruction = (instruction or "").strip()
+    if not text:
+        raise gr.Error("Écris une phrase à synthétiser.")
+    tts = get_model()
+    wavs, sr = tts.generate_custom_voice(
+        text=text,
+        language=language,
+        speaker=speaker,
+        instruct=instruction,
+    )
+    output_path = os.path.join(
+        tempfile.gettempdir(),
+        f"qwen_tts_{int(time.time() * 1000)}.wav",
+    )
+    sf.write(output_path, wavs[0], sr)
+    intent_json = {
+        "detected_glosses": [],
+        "detected_facial_expression": "not_connected_yet",
+        "subtitle": text,
+        "voice_instruction": instruction,
+        "language": language,
+        "speaker": speaker,
+        "pipeline_stage": "tts_only_mvp",
+    }
+    return output_path, text, intent_json
+with gr.Blocks(title="ASL to TTS MVP") as demo:
+    gr.Markdown(
+        """
+        # ASL to TTS MVP
+        Première version: on teste seulement la brique TTS.
+        Ensuite, on branchera:
+        video ASL -> glosses -> emotion -> intent JSON -> subtitle -> voice instruction -> TTS.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Subtitle temporaire",
+                value="Hello, I am happy to see you today.",
+                lines=3,
+            )
+            instruction_input = gr.Textbox(
+                label="Voice instruction",
+                value="Speak with a warm, happy, expressive voice.",
+                lines=2,
+            )
+            language_input = gr.Dropdown(
+                label="Language",
+                choices=[
+                    "Auto",
+                    "Chinese",
+                    "English",
+                    "Japanese",
+                    "Korean",
+                    "German",
+                    "French",
+                    "Russian",
+                    "Portuguese",
+                    "Spanish",
+                    "Italian",
+                ],
+                value="English",
+            )
+            speaker_input = gr.Dropdown(
+                label="Speaker",
+                choices=[
+                    "Vivian",
+                    "Serena",
+                    "Uncle_Fu",
+                    "Dylan",
+                    "Eric",
+                    "Ryan",
+                    "Aiden",
+                    "Ono_Anna",
+                    "Sohee",
+                ],
+                value="Ryan",
+            )
+            button = gr.Button("Generate speech")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated audio", type="filepath")
+            subtitle_output = gr.Textbox(label="Subtitle")
+            json_output = gr.JSON(label="Intent JSON")
+    button.click(
+        fn=generate_tts,
+        inputs=[
+            text_input,
+            language_input,
+            speaker_input,
+            instruction_input,
+        ],
+        outputs=[
+            audio_output,
+            subtitle_output,
+            json_output,
+        ],
+    )
+if __name__ == "__main__":
+    demo.queue().launch()

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ libsndfile1
2	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+qwen-tts
+soundfile
+torch