lilblueyes committed on
Commit
de2df4e
·
1 Parent(s): 94bf482

Add TTS MVP

Browse files
Files changed (4) hide show
  1. README.md +7 -7
  2. app.py +158 -0
  3. packages.txt +2 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Tts Test
3
- emoji: 📈
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.16.0
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: ASL TTS Test
3
+ emoji: 🗣️
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
 
 
7
  app_file: app.py
8
  pinned: false
9
  ---
10
 
11
+ # ASL TTS Test
12
+
13
+ MVP Gradio pour tester la brique TTS avant de brancher le pipeline vidéo ASL.
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import tempfile
4
+
5
+ import gradio as gr
6
+ import soundfile as sf
7
+ import torch
8
+ from qwen_tts import Qwen3TTSModel
9
+
10
+
11
# Checkpoint to load; can be overridden through the MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")

# Module-level cache for the loaded model; stays None until get_model() runs.
model = None


def get_model():
    """Return the shared TTS model, loading it on the first call.

    The model is loaded from ``MODEL_ID`` and stored in the module-level
    ``model`` variable, so later calls return the same object without
    reloading. When CUDA is available the model is placed on ``cuda:0`` in
    bfloat16; otherwise it is loaded on the CPU in float32.

    Returns:
        The cached ``Qwen3TTSModel`` instance.
    """
    global model

    if model is None:
        # Pick device and precision together so there is a single load call.
        use_cuda = torch.cuda.is_available()
        model = Qwen3TTSModel.from_pretrained(
            MODEL_ID,
            device_map="cuda:0" if use_cuda else "cpu",
            dtype=torch.bfloat16 if use_cuda else torch.float32,
        )

    return model
36
+
37
+
38
def generate_tts(text, language, speaker, instruction):
    """Synthesize ``text`` to a WAV file and build the placeholder intent JSON.

    Args:
        text: Sentence to speak. ``None`` is treated as empty; surrounding
            whitespace is stripped.
        language: Language name forwarded to the model as ``language``.
        speaker: Speaker name forwarded to the model as ``speaker``.
        instruction: Voice-style instruction forwarded to the model as
            ``instruct``. ``None`` is treated as empty; surrounding
            whitespace is stripped.

    Returns:
        A ``(output_path, text, intent_json)`` tuple: the path of the written
        WAV file, the stripped text, and a dict describing the request.

    Raises:
        gr.Error: If ``text`` is empty after stripping.
    """
    text = (text or "").strip()
    instruction = (instruction or "").strip()

    if not text:
        raise gr.Error("Écris une phrase à synthétiser.")

    tts = get_model()

    wavs, sr = tts.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruction,
    )

    # Reserve a unique file name instead of deriving one from the clock:
    # a millisecond timestamp can collide when two requests finish at the
    # same time, and one request would then overwrite the other's audio.
    # delete=False keeps the file on disk after the handle is closed so it
    # can be written below and then served through the returned path.
    with tempfile.NamedTemporaryFile(
        prefix="qwen_tts_",
        suffix=".wav",
        delete=False,
    ) as tmp_file:
        output_path = tmp_file.name

    # Only the first returned waveform is saved.
    sf.write(output_path, wavs[0], sr)

    # Placeholder payload: the gloss and facial-expression fields are fixed
    # values in this version; the rest echoes the request.
    intent_json = {
        "detected_glosses": [],
        "detected_facial_expression": "not_connected_yet",
        "subtitle": text,
        "voice_instruction": instruction,
        "language": language,
        "speaker": speaker,
        "pipeline_stage": "tts_only_mvp",
    }

    return output_path, text, intent_json
72
+
73
+
74
# Options offered in the "Language" dropdown.
LANGUAGE_CHOICES = [
    "Auto",
    "Chinese",
    "English",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
]

# Options offered in the "Speaker" dropdown.
SPEAKER_CHOICES = [
    "Vivian",
    "Serena",
    "Uncle_Fu",
    "Dylan",
    "Eric",
    "Ryan",
    "Aiden",
    "Ono_Anna",
    "Sohee",
]

# Two-column page: request controls on the left, generated results on the right.
with gr.Blocks(title="ASL to TTS MVP") as demo:
    gr.Markdown(
        """
        # ASL to TTS MVP

        Première version: on teste seulement la brique TTS.

        Ensuite, on branchera:
        video ASL -> glosses -> emotion -> intent JSON -> subtitle -> voice instruction -> TTS.
        """
    )

    with gr.Row():
        # Left column: inputs, created in the order they are displayed.
        with gr.Column():
            text_input = gr.Textbox(
                label="Subtitle temporaire",
                value="Hello, I am happy to see you today.",
                lines=3,
            )
            instruction_input = gr.Textbox(
                label="Voice instruction",
                value="Speak with a warm, happy, expressive voice.",
                lines=2,
            )
            language_input = gr.Dropdown(
                label="Language",
                choices=LANGUAGE_CHOICES,
                value="English",
            )
            speaker_input = gr.Dropdown(
                label="Speaker",
                choices=SPEAKER_CHOICES,
                value="Ryan",
            )
            button = gr.Button("Generate speech")

        # Right column: one output per value returned by generate_tts.
        with gr.Column():
            audio_output = gr.Audio(label="Generated audio", type="filepath")
            subtitle_output = gr.Textbox(label="Subtitle")
            json_output = gr.JSON(label="Intent JSON")

    # The inputs list follows generate_tts's parameter order
    # (text, language, speaker, instruction), not the on-screen order.
    button.click(
        fn=generate_tts,
        inputs=[text_input, language_input, speaker_input, instruction_input],
        outputs=[audio_output, subtitle_output, json_output],
    )


if __name__ == "__main__":
    demo.queue().launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ qwen-tts
3
+ soundfile
4
+ torch