VishnuPJ commited on
Commit
d83b514
·
verified ·
1 Parent(s): ecf28b9

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -148,3 +148,4 @@ LanPaint/examples/Original_No_Mask-example18.gif filter=lfs diff=lfs merge=lfs -
148
  LanPaint/examples/Original_No_Mask_example17.gif filter=lfs diff=lfs merge=lfs -text
149
  LanPaint/examples/Outpainted_40frames_Drag_Me_to_ComfyUI_example19.gif filter=lfs diff=lfs merge=lfs -text
150
  LanPaint/Nodes.JPG filter=lfs diff=lfs merge=lfs -text
 
 
148
  LanPaint/examples/Original_No_Mask_example17.gif filter=lfs diff=lfs merge=lfs -text
149
  LanPaint/examples/Outpainted_40frames_Drag_Me_to_ComfyUI_example19.gif filter=lfs diff=lfs merge=lfs -text
150
  LanPaint/Nodes.JPG filter=lfs diff=lfs merge=lfs -text
151
+ ComfyUI-Qwen3-ASR/assets/intro.png filter=lfs diff=lfs merge=lfs -text
ComfyUI-Qwen3-ASR/.github/workflows/publish.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Publish to Comfy registry
2
+ on:
3
+ workflow_dispatch:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "pyproject.toml"
9
+
10
+ jobs:
11
+ publish-node:
12
+ name: Publish Custom Node to registry
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - name: Check out code
16
+ uses: actions/checkout@v4
17
+ - name: Publish Custom Node
18
+ uses: Comfy-Org/publish-node-action@main
19
+ with:
20
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} ## Add your own personal access token to your Github Repository secrets and reference it here.
ComfyUI-Qwen3-ASR/.gitignore ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .Python
6
+ build/
7
+ develop-eggs/
8
+ dist/
9
+ downloads/
10
+ eggs/
11
+ .eggs/
12
+ lib/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ *.egg-info/
19
+ .installed.cfg
20
+ *.egg
21
+ .env
22
+ .venv
23
+ env/
24
+ venv/
25
+ .idea/
26
+ .vscode/
27
+ *.swp
28
+ *.swo
29
+ .DS_Store
ComfyUI-Qwen3-ASR/README.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ComfyUI-Qwen3-ASR
2
+
3
+ ComfyUI custom nodes for **Qwen3-ASR** (Automatic Speech Recognition) - audio-to-text transcription supporting 52 languages and dialects.
4
+
5
+ > 🔗 Compatible with [ComfyUI-Qwen3-TTS](https://github.com/DarioFT/ComfyUI-Qwen3-TTS) for complete speech workflows
6
+
7
+ <p align="center">
8
+ <img src="https://raw.githubusercontent.com/DarioFT/ComfyUI-Qwen3-ASR/refs/heads/main/assets/intro.png"/>
9
+ </p>
10
+
11
+ ## Features
12
+
13
+ - **Multi-language**: 30 languages + 22 Chinese dialects
14
+ - **Two model sizes**: 1.7B (best quality) and 0.6B (faster)
15
+ - **Auto language detection**: No need to specify language
16
+ - **Timestamps**: Optional word/character-level timing via Forced Aligner
17
+ - **Batch processing**: Transcribe multiple audio files
18
+ - **Auto-download**: Models download automatically on first use
19
+
20
+ ## Installation
21
+
22
+ ### Via ComfyUI Manager (Recommended)
23
+ Search for "Qwen3-ASR" in ComfyUI Manager
24
+
25
+ ### Manual Installation
26
+ ```bash
27
+ cd ComfyUI/custom_nodes
28
+ git clone https://github.com/DarioFT/ComfyUI-Qwen3-ASR.git
29
+ cd ComfyUI-Qwen3-ASR
30
+ pip install -r requirements.txt
31
+ ```
32
+
33
+ ## Nodes
34
+
35
+ ### Qwen3-ASR Loader
36
+ Loads the ASR model with auto-download support.
37
+
38
+ | Input | Type | Description |
39
+ |-------|------|-------------|
40
+ | repo_id | dropdown | Model: 1.7B or 0.6B |
41
+ | source | dropdown | HuggingFace or ModelScope |
42
+ | precision | dropdown | fp16, bf16, fp32 |
43
+ | attention | dropdown | auto, flash_attention_2, sdpa, eager |
44
+ | forced_aligner | dropdown | Optional aligner for timestamps |
45
+ | local_model_path | string | Optional custom model path |
46
+
47
+ ### Qwen3-ASR Transcribe
48
+ Transcribes a single audio input to text.
49
+
50
+ | Input | Type | Description |
51
+ |-------|------|-------------|
52
+ | model | QWEN3_ASR_MODEL | Loaded model |
53
+ | audio | AUDIO | Audio input (ComfyUI format) |
54
+ | language | dropdown | Force language or "auto" |
55
+ | context | string | Optional context hints |
56
+ | return_timestamps | boolean | Enable timestamp output |
57
+
58
+ | Output | Type | Description |
59
+ |--------|------|-------------|
60
+ | text | STRING | Transcribed text |
61
+ | language | STRING | Detected language |
62
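+ | timestamps | STRING | Word/character-level timestamps, one `start-end: text` line per segment (empty unless `return_timestamps` is enabled and a forced aligner is loaded) |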
63
+
64
+ ### Qwen3-ASR Batch Transcribe
65
+ Batch transcription for multiple audio files.
66
+
67
+ ## Supported Languages
68
+
69
+ Chinese, English, Cantonese, Arabic, German, French, Spanish, Portuguese, Indonesian, Italian, Korean, Russian, Thai, Vietnamese, Japanese, Turkish, Hindi, Malay, Dutch, Swedish, Danish, Finnish, Polish, Czech, Filipino, Persian, Greek, Hungarian, Macedonian, Romanian
70
+
71
+ Plus 22 Chinese dialects including Sichuan, Cantonese (HK/Guangdong), Wu, Minnan, and regional accents.
72
+
73
+ ## Workflow Examples
74
+
75
+ ### Basic Transcription
76
+ ```
77
+ (LoadAudio + Qwen3-ASR Loader) → Qwen3-ASR Transcribe → ShowText
78
+ ```
79
+
80
+ ### With TTS (Speech-to-Speech)
81
+ ```
82
+ LoadAudio → Qwen3-ASR Transcribe → [process text] → Qwen3-TTS → SaveAudio
83
+ ```
84
+
85
+ ## Model Storage
86
+
87
+ Models are stored in: `ComfyUI/models/Qwen3-ASR/`
88
+
89
+ ## Credits
90
+
91
+ - [Qwen3-ASR](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) by Alibaba Qwen Team
92
+ - [qwen-asr](https://pypi.org/project/qwen-asr/) Python package
93
+
94
+ ## License
95
+
96
+ Apache-2.0
ComfyUI-Qwen3-ASR/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""ComfyUI entry point: exposes the Qwen3-ASR nodes to the node registry."""

from .nodes import Qwen3ASRBatchTranscribe, Qwen3ASRLoader, Qwen3ASRTranscribe

# (node class, label shown in the ComfyUI menu). Each node is registered under
# its class name, so the two mappings below always share the same keys.
_NODES = (
    (Qwen3ASRLoader, "Qwen3-ASR Loader"),
    (Qwen3ASRTranscribe, "Qwen3-ASR Transcribe"),
    (Qwen3ASRBatchTranscribe, "Qwen3-ASR Batch Transcribe"),
)

NODE_CLASS_MAPPINGS = {node.__name__: node for node, _ in _NODES}

NODE_DISPLAY_NAME_MAPPINGS = {node.__name__: label for node, label in _NODES}

__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
ComfyUI-Qwen3-ASR/assets/intro.png ADDED

Git LFS Details

  • SHA256: cf528a961ff1f3929d7f10a65ab92501d49143fa2c0bfb70c06ebf4dc0b79a5c
  • Pointer size: 131 Bytes
  • Size of remote file: 148 kB
ComfyUI-Qwen3-ASR/example_workflows/base.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": "560c0123-a3c9-4148-b0cb-7b705dd02044",
3
+ "revision": 0,
4
+ "last_node_id": 5,
5
+ "last_link_id": 3,
6
+ "nodes": [
7
+ {
8
+ "id": 4,
9
+ "type": "LoadAudio",
10
+ "pos": [
11
+ 26.0944883333357,
12
+ 248.1008013888877
13
+ ],
14
+ "size": [
15
+ 282.83333587646484,
16
+ 136
17
+ ],
18
+ "flags": {},
19
+ "order": 0,
20
+ "mode": 0,
21
+ "inputs": [],
22
+ "outputs": [
23
+ {
24
+ "name": "AUDIO",
25
+ "type": "AUDIO",
26
+ "links": [
27
+ 1
28
+ ]
29
+ }
30
+ ],
31
+ "properties": {
32
+ "cnr_id": "comfy-core",
33
+ "ver": "0.10.0",
34
+ "Node name for S&R": "LoadAudio"
35
+ },
36
+ "widgets_values": [
37
+ "1.wav",
38
+ null,
39
+ ""
40
+ ]
41
+ },
42
+ {
43
+ "id": 1,
44
+ "type": "Qwen3ASRLoader",
45
+ "pos": [
46
+ 31.015860833335807,
47
+ -4.379546388890003
48
+ ],
49
+ "size": [
50
+ 270,
51
+ 178
52
+ ],
53
+ "flags": {},
54
+ "order": 1,
55
+ "mode": 0,
56
+ "inputs": [],
57
+ "outputs": [
58
+ {
59
+ "name": "model",
60
+ "type": "QWEN3_ASR_MODEL",
61
+ "links": [
62
+ 2
63
+ ]
64
+ }
65
+ ],
66
+ "properties": {
67
+ "Node name for S&R": "Qwen3ASRLoader"
68
+ },
69
+ "widgets_values": [
70
+ "Qwen/Qwen3-ASR-1.7B",
71
+ "HuggingFace",
72
+ "bf16",
73
+ "auto",
74
+ "None",
75
+ ""
76
+ ]
77
+ },
78
+ {
79
+ "id": 2,
80
+ "type": "Qwen3ASRTranscribe",
81
+ "pos": [
82
+ 366.6877755555577,
83
+ 51.57828166666546
84
+ ],
85
+ "size": [
86
+ 400,
87
+ 200
88
+ ],
89
+ "flags": {},
90
+ "order": 2,
91
+ "mode": 0,
92
+ "inputs": [
93
+ {
94
+ "name": "model",
95
+ "type": "QWEN3_ASR_MODEL",
96
+ "link": 2
97
+ },
98
+ {
99
+ "name": "audio",
100
+ "type": "AUDIO",
101
+ "link": 1
102
+ }
103
+ ],
104
+ "outputs": [
105
+ {
106
+ "name": "text",
107
+ "type": "STRING",
108
+ "links": [
109
+ 3
110
+ ]
111
+ },
112
+ {
113
+ "name": "language",
114
+ "type": "STRING",
115
+ "links": null
116
+ },
117
+ {
118
+ "name": "timestamps",
119
+ "type": "STRING",
120
+ "links": null
121
+ }
122
+ ],
123
+ "properties": {
124
+ "Node name for S&R": "Qwen3ASRTranscribe"
125
+ },
126
+ "widgets_values": [
127
+ "auto",
128
+ "",
129
+ false
130
+ ]
131
+ },
132
+ {
133
+ "id": 5,
134
+ "type": "PreviewAny",
135
+ "pos": [
136
+ 837.7456827777796,
137
+ 51.88921805555435
138
+ ],
139
+ "size": [
140
+ 210,
141
+ 166
142
+ ],
143
+ "flags": {},
144
+ "order": 3,
145
+ "mode": 0,
146
+ "inputs": [
147
+ {
148
+ "name": "source",
149
+ "type": "*",
150
+ "link": 3
151
+ }
152
+ ],
153
+ "outputs": [],
154
+ "properties": {
155
+ "cnr_id": "comfy-core",
156
+ "ver": "0.10.0",
157
+ "Node name for S&R": "PreviewAny"
158
+ },
159
+ "widgets_values": [
160
+ null,
161
+ null,
162
+ false
163
+ ]
164
+ }
165
+ ],
166
+ "links": [
167
+ [
168
+ 1,
169
+ 4,
170
+ 0,
171
+ 2,
172
+ 1,
173
+ "AUDIO"
174
+ ],
175
+ [
176
+ 2,
177
+ 1,
178
+ 0,
179
+ 2,
180
+ 0,
181
+ "QWEN3_ASR_MODEL"
182
+ ],
183
+ [
184
+ 3,
185
+ 2,
186
+ 0,
187
+ 5,
188
+ 0,
189
+ "STRING"
190
+ ]
191
+ ],
192
+ "groups": [],
193
+ "config": {},
194
+ "extra": {
195
+ "workflowRendererVersion": "LG",
196
+ "ue_links": [],
197
+ "ds": {
198
+ "scale": 1.128526645768025,
199
+ "offset": [
200
+ 249.0430116666644,
201
+ 262.42325416666796
202
+ ]
203
+ },
204
+ "frontendVersion": "1.37.11"
205
+ },
206
+ "version": 0.4
207
+ }
ComfyUI-Qwen3-ASR/example_workflows/simple_voice_clone-REQUIRES-TTS.json ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": "560c0123-a3c9-4148-b0cb-7b705dd02044",
3
+ "revision": 0,
4
+ "last_node_id": 9,
5
+ "last_link_id": 10,
6
+ "nodes": [
7
+ {
8
+ "id": 1,
9
+ "type": "Qwen3ASRLoader",
10
+ "pos": [
11
+ 31.015860833335807,
12
+ -4.379546388890003
13
+ ],
14
+ "size": [
15
+ 270,
16
+ 178
17
+ ],
18
+ "flags": {},
19
+ "order": 0,
20
+ "mode": 0,
21
+ "inputs": [],
22
+ "outputs": [
23
+ {
24
+ "name": "model",
25
+ "type": "QWEN3_ASR_MODEL",
26
+ "links": [
27
+ 2
28
+ ]
29
+ }
30
+ ],
31
+ "properties": {
32
+ "aux_id": "DarioFT/ComfyUI-Qwen3-ASR",
33
+ "ver": "5cdfee1b78f5b92a3f9d6baeabfb3bc688f551c4",
34
+ "Node name for S&R": "Qwen3ASRLoader"
35
+ },
36
+ "widgets_values": [
37
+ "Qwen/Qwen3-ASR-1.7B",
38
+ "HuggingFace",
39
+ "bf16",
40
+ "auto",
41
+ "None",
42
+ ""
43
+ ]
44
+ },
45
+ {
46
+ "id": 5,
47
+ "type": "PreviewAny",
48
+ "pos": [
49
+ 439.96154388889096,
50
+ 249.17299583333204
51
+ ],
52
+ "size": [
53
+ 210,
54
+ 166
55
+ ],
56
+ "flags": {},
57
+ "order": 4,
58
+ "mode": 0,
59
+ "inputs": [
60
+ {
61
+ "name": "source",
62
+ "type": "*",
63
+ "link": 3
64
+ }
65
+ ],
66
+ "outputs": [],
67
+ "properties": {
68
+ "cnr_id": "comfy-core",
69
+ "ver": "0.10.0",
70
+ "Node name for S&R": "PreviewAny"
71
+ },
72
+ "widgets_values": [
73
+ null,
74
+ null,
75
+ null
76
+ ]
77
+ },
78
+ {
79
+ "id": 6,
80
+ "type": "Qwen3Loader",
81
+ "pos": [
82
+ 899.2585502500009,
83
+ -216.610786916668
84
+ ],
85
+ "size": [
86
+ 270,
87
+ 154
88
+ ],
89
+ "flags": {},
90
+ "order": 1,
91
+ "mode": 0,
92
+ "inputs": [],
93
+ "outputs": [
94
+ {
95
+ "name": "model",
96
+ "type": "QWEN3_MODEL",
97
+ "links": [
98
+ 6
99
+ ]
100
+ }
101
+ ],
102
+ "properties": {
103
+ "cnr_id": "comfyui-qwen3-tts",
104
+ "ver": "6289ee949a75455e9fe1f90ac6d5f51445f03c73",
105
+ "Node name for S&R": "Qwen3Loader"
106
+ },
107
+ "widgets_values": [
108
+ "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
109
+ "HuggingFace",
110
+ "bf16",
111
+ "auto",
112
+ ""
113
+ ]
114
+ },
115
+ {
116
+ "id": 4,
117
+ "type": "LoadAudio",
118
+ "pos": [
119
+ 441.0337383333358,
120
+ -206.50964305555672
121
+ ],
122
+ "size": [
123
+ 282.83333587646484,
124
+ 136
125
+ ],
126
+ "flags": {},
127
+ "order": 2,
128
+ "mode": 0,
129
+ "inputs": [],
130
+ "outputs": [
131
+ {
132
+ "name": "AUDIO",
133
+ "type": "AUDIO",
134
+ "links": [
135
+ 1,
136
+ 7
137
+ ]
138
+ }
139
+ ],
140
+ "properties": {
141
+ "cnr_id": "comfy-core",
142
+ "ver": "0.10.0",
143
+ "Node name for S&R": "LoadAudio"
144
+ },
145
+ "widgets_values": [
146
+ "1.wav",
147
+ null,
148
+ null
149
+ ]
150
+ },
151
+ {
152
+ "id": 2,
153
+ "type": "Qwen3ASRTranscribe",
154
+ "pos": [
155
+ 363.4711922222244,
156
+ -3.1036350000012103
157
+ ],
158
+ "size": [
159
+ 400,
160
+ 200
161
+ ],
162
+ "flags": {},
163
+ "order": 3,
164
+ "mode": 0,
165
+ "inputs": [
166
+ {
167
+ "name": "model",
168
+ "type": "QWEN3_ASR_MODEL",
169
+ "link": 2
170
+ },
171
+ {
172
+ "name": "audio",
173
+ "type": "AUDIO",
174
+ "link": 1
175
+ }
176
+ ],
177
+ "outputs": [
178
+ {
179
+ "name": "text",
180
+ "type": "STRING",
181
+ "links": [
182
+ 3,
183
+ 10
184
+ ]
185
+ },
186
+ {
187
+ "name": "language",
188
+ "type": "STRING",
189
+ "links": null
190
+ },
191
+ {
192
+ "name": "timestamps",
193
+ "type": "STRING",
194
+ "links": null
195
+ }
196
+ ],
197
+ "properties": {
198
+ "aux_id": "DarioFT/ComfyUI-Qwen3-ASR",
199
+ "ver": "5cdfee1b78f5b92a3f9d6baeabfb3bc688f551c4",
200
+ "Node name for S&R": "Qwen3ASRTranscribe"
201
+ },
202
+ "widgets_values": [
203
+ "auto",
204
+ "",
205
+ false
206
+ ]
207
+ },
208
+ {
209
+ "id": 9,
210
+ "type": "Qwen3VoiceClone",
211
+ "pos": [
212
+ 841.1541889166685,
213
+ -5.106494222223706
214
+ ],
215
+ "size": [
216
+ 402.1443888888889,
217
+ 324.5160833333333
218
+ ],
219
+ "flags": {},
220
+ "order": 5,
221
+ "mode": 0,
222
+ "inputs": [
223
+ {
224
+ "name": "model",
225
+ "type": "QWEN3_MODEL",
226
+ "link": 6
227
+ },
228
+ {
229
+ "name": "ref_audio",
230
+ "shape": 7,
231
+ "type": "AUDIO",
232
+ "link": 7
233
+ },
234
+ {
235
+ "name": "prompt",
236
+ "shape": 7,
237
+ "type": "QWEN3_PROMPT",
238
+ "link": null
239
+ },
240
+ {
241
+ "name": "ref_text",
242
+ "shape": 7,
243
+ "type": "STRING",
244
+ "widget": {
245
+ "name": "ref_text"
246
+ },
247
+ "link": 10
248
+ }
249
+ ],
250
+ "outputs": [
251
+ {
252
+ "name": "AUDIO",
253
+ "type": "AUDIO",
254
+ "links": [
255
+ 9
256
+ ]
257
+ }
258
+ ],
259
+ "properties": {
260
+ "cnr_id": "comfyui-qwen3-tts",
261
+ "ver": "80a0fd71c2ed791285d552727e2a4b77e9b91a3d",
262
+ "Node name for S&R": "Qwen3VoiceClone"
263
+ },
264
+ "widgets_values": [
265
+ "The Qwen3-ASR family ASR models maintains high-quality and robust recognition under complex acoustic environments and challenging text patterns.",
266
+ 73052475174351,
267
+ "randomize",
268
+ "Auto",
269
+ "",
270
+ 2048,
271
+ 30
272
+ ]
273
+ },
274
+ {
275
+ "id": 7,
276
+ "type": "PreviewAudio",
277
+ "pos": [
278
+ 1307.8932808055577,
279
+ -3.6268658888902534
280
+ ],
281
+ "size": [
282
+ 270,
283
+ 88
284
+ ],
285
+ "flags": {},
286
+ "order": 6,
287
+ "mode": 0,
288
+ "inputs": [
289
+ {
290
+ "name": "audio",
291
+ "type": "AUDIO",
292
+ "link": 9
293
+ }
294
+ ],
295
+ "outputs": [],
296
+ "properties": {
297
+ "cnr_id": "comfy-core",
298
+ "ver": "0.10.0",
299
+ "Node name for S&R": "PreviewAudio"
300
+ },
301
+ "widgets_values": []
302
+ }
303
+ ],
304
+ "links": [
305
+ [
306
+ 1,
307
+ 4,
308
+ 0,
309
+ 2,
310
+ 1,
311
+ "AUDIO"
312
+ ],
313
+ [
314
+ 2,
315
+ 1,
316
+ 0,
317
+ 2,
318
+ 0,
319
+ "QWEN3_ASR_MODEL"
320
+ ],
321
+ [
322
+ 3,
323
+ 2,
324
+ 0,
325
+ 5,
326
+ 0,
327
+ "STRING"
328
+ ],
329
+ [
330
+ 6,
331
+ 6,
332
+ 0,
333
+ 9,
334
+ 0,
335
+ "QWEN3_MODEL"
336
+ ],
337
+ [
338
+ 7,
339
+ 4,
340
+ 0,
341
+ 9,
342
+ 1,
343
+ "AUDIO"
344
+ ],
345
+ [
346
+ 9,
347
+ 9,
348
+ 0,
349
+ 7,
350
+ 0,
351
+ "AUDIO"
352
+ ],
353
+ [
354
+ 10,
355
+ 2,
356
+ 0,
357
+ 9,
358
+ 3,
359
+ "STRING"
360
+ ]
361
+ ],
362
+ "groups": [],
363
+ "config": {},
364
+ "extra": {
365
+ "workflowRendererVersion": "LG",
366
+ "ue_links": [],
367
+ "ds": {
368
+ "scale": 0.932666649395062,
369
+ "offset": [
370
+ 166.70858886110872,
371
+ 451.1393831111121
372
+ ]
373
+ },
374
+ "frontendVersion": "1.37.11"
375
+ },
376
+ "version": 0.4
377
+ }
ComfyUI-Qwen3-ASR/nodes.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import torch
4
+ import numpy as np
5
+ import folder_paths
6
+ import comfy.model_management as mm
7
+ from qwen_asr import Qwen3ASRModel
8
+
9
# All Qwen3-ASR checkpoints live in a dedicated sub-folder of ComfyUI's models
# directory. The folder is created at import time and registered with ComfyUI
# under the "Qwen3-ASR" key.
QWEN3_ASR_MODELS_DIR = os.path.join(folder_paths.models_dir, "Qwen3-ASR")
os.makedirs(QWEN3_ASR_MODELS_DIR, exist_ok=True)
folder_paths.add_model_folder_path("Qwen3-ASR", QWEN3_ASR_MODELS_DIR)

# Hub repo id -> folder name used below QWEN3_ASR_MODELS_DIR.
QWEN3_ASR_MODELS = {
    f"Qwen/{name}": name
    for name in ("Qwen3-ASR-1.7B", "Qwen3-ASR-0.6B")
}

# Same mapping for the optional forced aligner; the "None" entry is the
# dropdown choice that disables alignment.
QWEN3_FORCED_ALIGNERS = {
    "None": None,
    "Qwen/Qwen3-ForcedAligner-0.6B": "Qwen3-ForcedAligner-0.6B",
}

# Choices offered by the language dropdowns; "auto" is mapped to "no forced
# language" by the transcribe nodes.
SUPPORTED_LANGUAGES = ["auto"] + [
    "Chinese",
    "English",
    "Cantonese",
    "Arabic",
    "German",
    "French",
    "Spanish",
    "Portuguese",
    "Indonesian",
    "Italian",
    "Korean",
    "Russian",
    "Thai",
    "Vietnamese",
    "Japanese",
    "Turkish",
    "Hindi",
    "Malay",
    "Dutch",
    "Swedish",
    "Danish",
    "Finnish",
    "Polish",
    "Czech",
    "Filipino",
    "Persian",
    "Greek",
    "Hungarian",
    "Macedonian",
    "Romanian",
]
34
+
35
+
36
def get_local_model_path(repo_id: str) -> str:
    """Return the folder inside ``QWEN3_ASR_MODELS_DIR`` that holds *repo_id*.

    Known ASR and forced-aligner repos use their configured folder name; any
    other id falls back to the repo id with ``/`` replaced by ``_``.
    """
    for table in (QWEN3_ASR_MODELS, QWEN3_FORCED_ALIGNERS):
        folder_name = table.get(repo_id)
        if folder_name:
            break
    else:
        # Not a known repo (or mapped to a falsy value such as None).
        folder_name = repo_id.replace("/", "_")
    return os.path.join(QWEN3_ASR_MODELS_DIR, folder_name)
39
+
40
+
41
def migrate_cached_model(repo_id: str, target_path: str) -> bool:
    """Make sure *target_path* contains *repo_id*, reusing local hub caches.

    Lookup order:

    1. *target_path* itself, if it is already a non-empty directory.
    2. The Hugging Face hub cache. Its location honours ``HF_HUB_CACHE`` and
       ``HF_HOME`` before falling back to ``~/.cache/huggingface/hub``. When
       several snapshots are cached, the most recently modified non-empty one
       is copied instead of whichever entry ``os.listdir`` happens to return
       first.
    3. The ModelScope cache at ``~/.cache/modelscope/hub/<org>/<name>``.

    Args:
        repo_id: Hub repository id such as ``"Qwen/Qwen3-ASR-1.7B"``.
        target_path: Destination directory inside the ComfyUI models folder.

    Returns:
        ``True`` when *target_path* is populated (already, or after a copy),
        ``False`` when no usable cached copy was found.
    """
    if os.path.isdir(target_path) and os.listdir(target_path):
        return True

    home = os.path.expanduser("~")

    # --- Hugging Face cache -------------------------------------------------
    hf_cache = os.environ.get("HF_HUB_CACHE")
    if not hf_cache:
        hf_home = os.environ.get("HF_HOME") or os.path.join(home, ".cache", "huggingface")
        hf_cache = os.path.join(hf_home, "hub")

    snapshots_dir = os.path.join(
        hf_cache, f"models--{repo_id.replace('/', '--')}", "snapshots"
    )
    if os.path.isdir(snapshots_dir):
        candidates = [
            os.path.join(snapshots_dir, entry) for entry in os.listdir(snapshots_dir)
        ]
        # Ignore stray files and empty snapshot folders: copying those would
        # report success while leaving nothing loadable behind.
        candidates = [
            path for path in candidates if os.path.isdir(path) and os.listdir(path)
        ]
        if candidates:
            source = max(candidates, key=os.path.getmtime)
            print(f"Migrating model from HuggingFace cache: {source} -> {target_path}")
            shutil.copytree(source, target_path, dirs_exist_ok=True)
            return True

    # --- ModelScope cache ---------------------------------------------------
    ms_cache = os.path.join(home, ".cache", "modelscope", "hub")
    ms_model_dir = os.path.join(ms_cache, repo_id.replace("/", os.sep))
    if os.path.isdir(ms_model_dir) and os.listdir(ms_model_dir):
        print(f"Migrating model from ModelScope cache: {ms_model_dir} -> {target_path}")
        shutil.copytree(ms_model_dir, target_path, dirs_exist_ok=True)
        return True

    return False
65
+
66
+
67
def download_model_to_comfyui(repo_id: str, source: str) -> str:
    """Return the local folder for *repo_id*, downloading it when necessary.

    A copy already present in the target folder or in a local hub cache is
    reused via ``migrate_cached_model``. Otherwise the repository is fetched
    with ``snapshot_download`` from ModelScope when *source* is
    ``"ModelScope"``, and from Hugging Face for any other value.
    """
    destination = get_local_model_path(repo_id)

    if not migrate_cached_model(repo_id, destination):
        os.makedirs(destination, exist_ok=True)

        # The hub clients are imported lazily so only the selected one has to
        # be importable.
        if source == "ModelScope":
            from modelscope import snapshot_download
            print(f"Downloading {repo_id} from ModelScope to {destination}...")
        else:
            from huggingface_hub import snapshot_download
            print(f"Downloading {repo_id} from HuggingFace to {destination}...")

        snapshot_download(repo_id, local_dir=destination)
        return destination

    print(f"Model available at: {destination}")
    return destination
86
+
87
+
88
def load_audio_input(audio_input):
    """Convert a ComfyUI ``AUDIO`` dict into ``(mono float32 ndarray, sample_rate)``.

    Only the first item along the leading (batch) axis of ``waveform`` is
    used. Multi-channel audio is averaged down to mono.

    The tensor is detached, moved to the CPU and cast to float32 before any
    arithmetic or NumPy conversion. Without that step ``Tensor.numpy()``
    raises for CUDA/MPS tensors, tensors that require grad and bfloat16
    tensors, and ``torch.mean`` raises for integer tensors.

    Args:
        audio_input: ``None`` or a mapping with ``"waveform"`` (a tensor
            indexed as ``[batch][channel][sample]``) and ``"sample_rate"``.

    Returns:
        ``None`` when *audio_input* is ``None``; otherwise a tuple of the
        1-D float32 NumPy waveform and the unchanged sample rate.
    """
    if audio_input is None:
        return None

    waveform = audio_input["waveform"]
    sample_rate = audio_input["sample_rate"]

    wav = waveform[0].detach().to(device="cpu", dtype=torch.float32)

    if wav.shape[0] > 1:
        # Down-mix multi-channel audio by averaging the channels.
        wav = torch.mean(wav, dim=0)
    else:
        wav = wav.squeeze(0)

    # astype() copies, so the result never aliases the caller's tensor.
    return (wav.numpy().astype(np.float32), sample_rate)
103
+
104
+
105
class Qwen3ASRLoader:
    """ComfyUI node that loads a Qwen3-ASR model, downloading it if needed."""

    RETURN_TYPES = ("QWEN3_ASR_MODEL",)
    RETURN_NAMES = ("model",)
    FUNCTION = "load_model"
    CATEGORY = "Qwen3-ASR"

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's widgets (model, hub, precision, attention, ...)."""
        required = {
            "repo_id": (list(QWEN3_ASR_MODELS.keys()), {"default": "Qwen/Qwen3-ASR-1.7B"}),
            "source": (["HuggingFace", "ModelScope"], {"default": "HuggingFace"}),
            "precision": (["fp16", "bf16", "fp32"], {"default": "bf16"}),
            "attention": (["auto", "flash_attention_2", "sdpa", "eager"], {"default": "auto"}),
        }
        optional = {
            "forced_aligner": (list(QWEN3_FORCED_ALIGNERS.keys()), {"default": "None"}),
            "local_model_path": ("STRING", {"default": "", "multiline": False}),
        }
        return {"required": required, "optional": optional}

    def load_model(self, repo_id, source, precision, attention, forced_aligner="None", local_model_path=""):
        """Resolve the model location and build the ``Qwen3ASRModel``.

        Args:
            repo_id: Hub id of the ASR model to load.
            source: ``"HuggingFace"`` or ``"ModelScope"``; used only when a
                download is required.
            precision: ``"fp16"``, ``"bf16"`` or anything else for float32.
                ``"bf16"`` is replaced by float16 on MPS devices.
            attention: Attention implementation; ``"auto"`` leaves the choice
                to the library by not passing ``attn_implementation``.
            forced_aligner: Hub id of a forced aligner, or ``"None"``.
            local_model_path: When non-blank, used verbatim as the model path
                instead of the ComfyUI models folder.

        Returns:
            A one-element tuple holding the loaded model.
        """
        device = mm.get_torch_device()

        if precision == "fp16":
            dtype = torch.float16
        elif precision == "bf16":
            if device.type == "mps":
                dtype = torch.float16
                print("Note: Using fp16 on MPS (bf16 has limited support)")
            else:
                dtype = torch.bfloat16
        else:
            dtype = torch.float32

        custom_path = local_model_path.strip() if local_model_path else ""
        if custom_path:
            model_path = custom_path
            print(f"Loading from local path: {model_path}")
        else:
            model_path = get_local_model_path(repo_id)
            if os.path.exists(model_path) and os.listdir(model_path):
                print(f"Loading from ComfyUI models folder: {model_path}")
            else:
                model_path = download_model_to_comfyui(repo_id, source)

        explicit_attention = attention != "auto"

        model_kwargs = {
            "dtype": dtype,
            "device_map": str(device),
            "max_inference_batch_size": 32,
            "max_new_tokens": 256,
        }
        if explicit_attention:
            model_kwargs["attn_implementation"] = attention

        if forced_aligner and forced_aligner != "None":
            aligner_path = get_local_model_path(forced_aligner)
            if not os.path.exists(aligner_path) or not os.listdir(aligner_path):
                aligner_path = download_model_to_comfyui(forced_aligner, source)

            # The aligner is loaded with the same dtype/device/attention
            # settings as the ASR model.
            aligner_kwargs = {"dtype": dtype, "device_map": str(device)}
            if explicit_attention:
                aligner_kwargs["attn_implementation"] = attention

            model_kwargs["forced_aligner"] = aligner_path
            model_kwargs["forced_aligner_kwargs"] = aligner_kwargs

        print(f"Loading Qwen3-ASR model from {model_path}...")
        return (Qwen3ASRModel.from_pretrained(model_path, **model_kwargs),)
175
+
176
+
177
class Qwen3ASRTranscribe:
    """ComfyUI node that transcribes a single audio input to text."""

    RETURN_TYPES = ("STRING", "STRING", "STRING")
    RETURN_NAMES = ("text", "language", "timestamps")
    FUNCTION = "transcribe"
    CATEGORY = "Qwen3-ASR"

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's inputs: model, audio and optional settings."""
        required = {
            "model": ("QWEN3_ASR_MODEL",),
            "audio": ("AUDIO",),
        }
        optional = {
            "language": (SUPPORTED_LANGUAGES, {"default": "auto"}),
            "context": ("STRING", {"default": "", "multiline": True}),
            "return_timestamps": ("BOOLEAN", {"default": False}),
        }
        return {"required": required, "optional": optional}

    def transcribe(self, model, audio, language="auto", context="", return_timestamps=False):
        """Run the model on *audio* and return ``(text, language, timestamps)``.

        ``language="auto"`` and a blank *context* are both passed to the
        model as ``None``. The timestamps string has one
        ``start-end: text`` line per entry and is empty unless
        *return_timestamps* is set and the result carries time stamps.
        Missing audio yields three empty strings.
        """
        audio_data = load_audio_input(audio)
        if audio_data is None:
            return ("", "", "")

        # Whitespace-only context counts as "no context".
        hint = context if context.strip() else None

        first = model.transcribe(
            audio=audio_data,
            language=None if language == "auto" else language,
            context=hint,
            return_time_stamps=return_timestamps,
        )[0]

        text = first.text
        detected_lang = first.language or ""

        timestamps_str = ""
        if return_timestamps and first.time_stamps:
            timestamps_str = "\n".join(
                f"{ts.start_time:.2f}-{ts.end_time:.2f}: {ts.text}"
                for ts in first.time_stamps
            )

        return (text, detected_lang, timestamps_str)
224
+
225
+
226
class Qwen3ASRBatchTranscribe:
    """ComfyUI node that transcribes several audio clips in one model call."""

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node's inputs: model, audio and optional settings."""
        return {
            "required": {
                "model": ("QWEN3_ASR_MODEL",),
                "audio_list": ("AUDIO",),
            },
            "optional": {
                "language": (SUPPORTED_LANGUAGES, {"default": "auto"}),
                "return_timestamps": ("BOOLEAN", {"default": False}),
            }
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("transcriptions",)
    FUNCTION = "batch_transcribe"
    CATEGORY = "Qwen3-ASR"

    @staticmethod
    def _split_batch(audio):
        """Split one AUDIO dict into one dict per item of its leading axis.

        ``None`` yields no items. A dict whose ``waveform`` has at most one
        item on its leading axis is returned unchanged.
        """
        if audio is None:
            return []
        waveform = audio["waveform"]
        batch_size = waveform.shape[0]
        if batch_size <= 1:
            return [audio]
        # Slicing with i:i+1 keeps the leading axis, so every piece has the
        # same layout as a single-clip AUDIO dict.
        return [{**audio, "waveform": waveform[i:i + 1]} for i in range(batch_size)]

    def batch_transcribe(self, model, audio_list, language="auto", return_timestamps=False):
        """Transcribe every clip in *audio_list* and return one text report.

        *audio_list* may be a single AUDIO dict or a list of them. A dict
        holding several clips along the leading axis of ``waveform`` is
        expanded into one clip per item; previously everything after the
        first item was silently dropped.

        Args:
            model: Loaded Qwen3-ASR model exposing ``transcribe``.
            audio_list: AUDIO dict or list of AUDIO dicts; ``None`` entries
                are skipped.
            language: Language forced for every clip, or ``"auto"``.
            return_timestamps: Append indented ``start-end: text`` lines
                under each clip when the result carries time stamps.

        Returns:
            A one-element tuple with the report, one ``[index] (language):
            text`` line per clip, or ``("",)`` when there is no audio.
        """
        if not isinstance(audio_list, list):
            audio_list = [audio_list]

        audio_inputs = []
        for audio in audio_list:
            for clip in self._split_batch(audio):
                audio_data = load_audio_input(clip)
                # Explicit None check: the value is an (ndarray, rate) tuple.
                if audio_data is not None:
                    audio_inputs.append(audio_data)

        if not audio_inputs:
            return ("",)

        lang = None if language == "auto" else language
        languages = [lang] * len(audio_inputs) if lang else None

        results = model.transcribe(
            audio=audio_inputs,
            language=languages,
            return_time_stamps=return_timestamps,
        )

        output_lines = []
        for i, result in enumerate(results):
            output_lines.append(f"[{i}] ({result.language}): {result.text}")
            if return_timestamps and result.time_stamps:
                output_lines.extend(
                    f" {ts.start_time:.2f}-{ts.end_time:.2f}: {ts.text}"
                    for ts in result.time_stamps
                )

        return ("\n".join(output_lines),)
ComfyUI-Qwen3-ASR/pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "comfyui-qwen-asr"
3
+ description = "A ComfyUI custom node suite for Qwen3-ASR, supporting speech-to-text transcription with 1.7B and 0.6B models, 52 languages/dialects, and optional timestamp alignment."
4
+ version = "1.0.0"
5
+ license = { text = "Apache-2.0" }
6
+
7
+ dependencies = ["qwen-asr", "modelscope", "soundfile", "numpy", "torch", "transformers", "accelerate"]
8
+
9
+ [project.urls]
10
+ Repository = "https://github.com/DarioFT/ComfyUI-Qwen3-ASR"
11
+ Documentation = "https://github.com/DarioFT/ComfyUI-Qwen3-ASR/wiki"
12
+ "Bug Tracker" = "https://github.com/DarioFT/ComfyUI-Qwen3-ASR/issues"
13
+
14
+ [tool.comfy]
15
+ PublisherId = "darioft"
16
+ DisplayName = "ComfyUI-Qwen3-ASR"
ComfyUI-Qwen3-ASR/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ qwen-asr
2
+ modelscope
3
+ soundfile
4
+ numpy
5
+ torch
6
+ transformers
7
+ accelerate