Upload 10 files

Browse files

Files changed (11) hide show

.gitattributes +1 -0
ComfyUI-Qwen3-ASR/.github/workflows/publish.yml +20 -0
ComfyUI-Qwen3-ASR/.gitignore +29 -0
ComfyUI-Qwen3-ASR/README.md +96 -0
ComfyUI-Qwen3-ASR/__init__.py +19 -0
ComfyUI-Qwen3-ASR/assets/intro.png +3 -0
ComfyUI-Qwen3-ASR/example_workflows/base.json +207 -0
ComfyUI-Qwen3-ASR/example_workflows/simple_voice_clone-REQUIRES-TTS.json +377 -0
ComfyUI-Qwen3-ASR/nodes.py +275 -0
ComfyUI-Qwen3-ASR/pyproject.toml +16 -0
ComfyUI-Qwen3-ASR/requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -148,3 +148,4 @@ LanPaint/examples/Original_No_Mask-example18.gif filter=lfs diff=lfs merge=lfs -
 LanPaint/examples/Original_No_Mask_example17.gif filter=lfs diff=lfs merge=lfs -text
 LanPaint/examples/Outpainted_40frames_Drag_Me_to_ComfyUI_example19.gif filter=lfs diff=lfs merge=lfs -text
 LanPaint/Nodes.JPG filter=lfs diff=lfs merge=lfs -text

 LanPaint/examples/Original_No_Mask_example17.gif filter=lfs diff=lfs merge=lfs -text
 LanPaint/examples/Outpainted_40frames_Drag_Me_to_ComfyUI_example19.gif filter=lfs diff=lfs merge=lfs -text
 LanPaint/Nodes.JPG filter=lfs diff=lfs merge=lfs -text
+ComfyUI-Qwen3-ASR/assets/intro.png filter=lfs diff=lfs merge=lfs -text

ComfyUI-Qwen3-ASR/.github/workflows/publish.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: Publish to Comfy registry
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - "pyproject.toml"
+jobs:
+  publish-node:
+    name: Publish Custom Node to registry
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+      - name: Publish Custom Node
+        uses: Comfy-Org/publish-node-action@main
+        with:
+          personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} ## Add your own personal access token to your GitHub repository secrets and reference it here.

ComfyUI-Qwen3-ASR/.gitignore ADDED Viewed

	@@ -0,0 +1,29 @@

+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+.env
+.venv
+env/
+venv/
+.idea/
+.vscode/
+*.swp
+*.swo
+.DS_Store

ComfyUI-Qwen3-ASR/README.md ADDED Viewed

	@@ -0,0 +1,96 @@

+# ComfyUI-Qwen3-ASR
+ComfyUI custom nodes for **Qwen3-ASR** (Automatic Speech Recognition) - audio-to-text transcription supporting 52 languages and dialects.
+> 🔗 Compatible with [ComfyUI-Qwen3-TTS](https://github.com/DarioFT/ComfyUI-Qwen3-TTS) for complete speech workflows
+<p align="center">
+    <img src="https://raw.githubusercontent.com/DarioFT/ComfyUI-Qwen3-ASR/refs/heads/main/assets/intro.png"/>
+</p>
+## Features
+- **Multi-language**: 30 languages + 22 Chinese dialects
+- **Two model sizes**: 1.7B (best quality) and 0.6B (faster)
+- **Auto language detection**: No need to specify language
+- **Timestamps**: Optional word/character-level timing via Forced Aligner
+- **Batch processing**: Transcribe multiple audio files
+- **Auto-download**: Models download automatically on first use
+## Installation
+### Via ComfyUI Manager (Recommended)
+Search for "Qwen3-ASR" in ComfyUI Manager
+### Manual Installation
+```bash
+cd ComfyUI/custom_nodes
+git clone https://github.com/DarioFT/ComfyUI-Qwen3-ASR.git
+cd ComfyUI-Qwen3-ASR
+pip install -r requirements.txt
+```
+## Nodes
+### Qwen3-ASR Loader
+Loads the ASR model with auto-download support.
+| Input | Type | Description |
+|-------|------|-------------|
+| repo_id | dropdown | Model: 1.7B or 0.6B |
+| source | dropdown | HuggingFace or ModelScope |
+| precision | dropdown | fp16, bf16, fp32 |
+| attention | dropdown | auto, flash_attention_2, sdpa, eager |
+| forced_aligner | dropdown | Optional aligner for timestamps |
+| local_model_path | string | Optional custom model path |
+### Qwen3-ASR Transcribe
+Transcribes a single audio input to text.
+| Input | Type | Description |
+|-------|------|-------------|
+| model | QWEN3_ASR_MODEL | Loaded model |
+| audio | AUDIO | Audio input (ComfyUI format) |
+| language | dropdown | Force language or "auto" |
+| context | string | Optional context hints |
+| return_timestamps | boolean | Enable timestamp output |
+| Output | Type | Description |
+|--------|------|-------------|
+| text | STRING | Transcribed text |
+| language | STRING | Detected language |
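+| timestamps | STRING | Word/character-level timestamps, one `start-end: text` entry per line (empty unless `return_timestamps` is enabled and a forced aligner is loaded) |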
+### Qwen3-ASR Batch Transcribe
+Batch transcription for multiple audio files.
+## Supported Languages
+Chinese, English, Cantonese, Arabic, German, French, Spanish, Portuguese, Indonesian, Italian, Korean, Russian, Thai, Vietnamese, Japanese, Turkish, Hindi, Malay, Dutch, Swedish, Danish, Finnish, Polish, Czech, Filipino, Persian, Greek, Hungarian, Macedonian, Romanian
+Plus 22 Chinese dialects including Sichuan, Cantonese (HK/Guangdong), Wu, Minnan, and regional accents.
+## Workflow Examples
+### Basic Transcription
+```
+(LoadAudio, Qwen3-ASR Loader) → Qwen3-ASR Transcribe → ShowText
+```
+### With TTS (Speech-to-Speech)
+```
+LoadAudio → Qwen3-ASR Transcribe → [process text] → Qwen3-TTS → SaveAudio
+```
+## Model Storage
+Models are stored in: `ComfyUI/models/Qwen3-ASR/`
+## Credits
+- [Qwen3-ASR](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) by Alibaba Qwen Team
+- [qwen-asr](https://pypi.org/project/qwen-asr/) Python package
+## License
+Apache-2.0

ComfyUI-Qwen3-ASR/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from .nodes import (
+    Qwen3ASRLoader,
+    Qwen3ASRTranscribe,
+    Qwen3ASRBatchTranscribe,
+)
+# Node identifier -> implementing class, for every node exported by this package.
+NODE_CLASS_MAPPINGS = {
+    "Qwen3ASRLoader": Qwen3ASRLoader,
+    "Qwen3ASRTranscribe": Qwen3ASRTranscribe,
+    "Qwen3ASRBatchTranscribe": Qwen3ASRBatchTranscribe,
+}
+# Node identifier -> human-readable title; keys mirror NODE_CLASS_MAPPINGS.
+NODE_DISPLAY_NAME_MAPPINGS = {
+    "Qwen3ASRLoader": "Qwen3-ASR Loader",
+    "Qwen3ASRTranscribe": "Qwen3-ASR Transcribe",
+    "Qwen3ASRBatchTranscribe": "Qwen3-ASR Batch Transcribe",
+}
+# Only the two mapping tables are part of the package's public surface.
+__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]

ComfyUI-Qwen3-ASR/assets/intro.png ADDED Viewed

Git LFS Details

SHA256: cf528a961ff1f3929d7f10a65ab92501d49143fa2c0bfb70c06ebf4dc0b79a5c
Pointer size: 131 Bytes
Size of remote file: 148 kB

ComfyUI-Qwen3-ASR/example_workflows/base.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "id": "560c0123-a3c9-4148-b0cb-7b705dd02044",
+  "revision": 0,
+  "last_node_id": 5,
+  "last_link_id": 3,
+  "nodes": [
+    {
+      "id": 4,
+      "type": "LoadAudio",
+      "pos": [
+        26.0944883333357,
+        248.1008013888877
+      ],
+      "size": [
+        282.83333587646484,
+        136
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            1
+          ]
+        }
+      ],
+      "properties": {
+        "cnr_id": "comfy-core",
+        "ver": "0.10.0",
+        "Node name for S&R": "LoadAudio"
+      },
+      "widgets_values": [
+        "1.wav",
+        null,
+        ""
+      ]
+    },
+    {
+      "id": 1,
+      "type": "Qwen3ASRLoader",
+      "pos": [
+        31.015860833335807,
+        -4.379546388890003
+      ],
+      "size": [
+        270,
+        178
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "model",
+          "type": "QWEN3_ASR_MODEL",
+          "links": [
+            2
+          ]
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "Qwen3ASRLoader"
+      },
+      "widgets_values": [
+        "Qwen/Qwen3-ASR-1.7B",
+        "HuggingFace",
+        "bf16",
+        "auto",
+        "None",
+        ""
+      ]
+    },
+    {
+      "id": 2,
+      "type": "Qwen3ASRTranscribe",
+      "pos": [
+        366.6877755555577,
+        51.57828166666546
+      ],
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "QWEN3_ASR_MODEL",
+          "link": 2
+        },
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 1
+        }
+      ],
+      "outputs": [
+        {
+          "name": "text",
+          "type": "STRING",
+          "links": [
+            3
+          ]
+        },
+        {
+          "name": "language",
+          "type": "STRING",
+          "links": null
+        },
+        {
+          "name": "timestamps",
+          "type": "STRING",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "Qwen3ASRTranscribe"
+      },
+      "widgets_values": [
+        "auto",
+        "",
+        false
+      ]
+    },
+    {
+      "id": 5,
+      "type": "PreviewAny",
+      "pos": [
+        837.7456827777796,
+        51.88921805555435
+      ],
+      "size": [
+        210,
+        166
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "source",
+          "type": "*",
+          "link": 3
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "cnr_id": "comfy-core",
+        "ver": "0.10.0",
+        "Node name for S&R": "PreviewAny"
+      },
+      "widgets_values": [
+        null,
+        null,
+        false
+      ]
+    }
+  ],
+  "links": [
+    [
+      1,
+      4,
+      0,
+      2,
+      1,
+      "AUDIO"
+    ],
+    [
+      2,
+      1,
+      0,
+      2,
+      0,
+      "QWEN3_ASR_MODEL"
+    ],
+    [
+      3,
+      2,
+      0,
+      5,
+      0,
+      "STRING"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "workflowRendererVersion": "LG",
+    "ue_links": [],
+    "ds": {
+      "scale": 1.128526645768025,
+      "offset": [
+        249.0430116666644,
+        262.42325416666796
+      ]
+    },
+    "frontendVersion": "1.37.11"
+  },
+  "version": 0.4
+}

ComfyUI-Qwen3-ASR/example_workflows/simple_voice_clone-REQUIRES-TTS.json ADDED Viewed

	@@ -0,0 +1,377 @@

+{
+  "id": "560c0123-a3c9-4148-b0cb-7b705dd02044",
+  "revision": 0,
+  "last_node_id": 9,
+  "last_link_id": 10,
+  "nodes": [
+    {
+      "id": 1,
+      "type": "Qwen3ASRLoader",
+      "pos": [
+        31.015860833335807,
+        -4.379546388890003
+      ],
+      "size": [
+        270,
+        178
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "model",
+          "type": "QWEN3_ASR_MODEL",
+          "links": [
+            2
+          ]
+        }
+      ],
+      "properties": {
+        "aux_id": "DarioFT/ComfyUI-Qwen3-ASR",
+        "ver": "5cdfee1b78f5b92a3f9d6baeabfb3bc688f551c4",
+        "Node name for S&R": "Qwen3ASRLoader"
+      },
+      "widgets_values": [
+        "Qwen/Qwen3-ASR-1.7B",
+        "HuggingFace",
+        "bf16",
+        "auto",
+        "None",
+        ""
+      ]
+    },
+    {
+      "id": 5,
+      "type": "PreviewAny",
+      "pos": [
+        439.96154388889096,
+        249.17299583333204
+      ],
+      "size": [
+        210,
+        166
+      ],
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "source",
+          "type": "*",
+          "link": 3
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "cnr_id": "comfy-core",
+        "ver": "0.10.0",
+        "Node name for S&R": "PreviewAny"
+      },
+      "widgets_values": [
+        null,
+        null,
+        null
+      ]
+    },
+    {
+      "id": 6,
+      "type": "Qwen3Loader",
+      "pos": [
+        899.2585502500009,
+        -216.610786916668
+      ],
+      "size": [
+        270,
+        154
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "model",
+          "type": "QWEN3_MODEL",
+          "links": [
+            6
+          ]
+        }
+      ],
+      "properties": {
+        "cnr_id": "comfyui-qwen3-tts",
+        "ver": "6289ee949a75455e9fe1f90ac6d5f51445f03c73",
+        "Node name for S&R": "Qwen3Loader"
+      },
+      "widgets_values": [
+        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
+        "HuggingFace",
+        "bf16",
+        "auto",
+        ""
+      ]
+    },
+    {
+      "id": 4,
+      "type": "LoadAudio",
+      "pos": [
+        441.0337383333358,
+        -206.50964305555672
+      ],
+      "size": [
+        282.83333587646484,
+        136
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            1,
+            7
+          ]
+        }
+      ],
+      "properties": {
+        "cnr_id": "comfy-core",
+        "ver": "0.10.0",
+        "Node name for S&R": "LoadAudio"
+      },
+      "widgets_values": [
+        "1.wav",
+        null,
+        null
+      ]
+    },
+    {
+      "id": 2,
+      "type": "Qwen3ASRTranscribe",
+      "pos": [
+        363.4711922222244,
+        -3.1036350000012103
+      ],
+      "size": [
+        400,
+        200
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "QWEN3_ASR_MODEL",
+          "link": 2
+        },
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 1
+        }
+      ],
+      "outputs": [
+        {
+          "name": "text",
+          "type": "STRING",
+          "links": [
+            3,
+            10
+          ]
+        },
+        {
+          "name": "language",
+          "type": "STRING",
+          "links": null
+        },
+        {
+          "name": "timestamps",
+          "type": "STRING",
+          "links": null
+        }
+      ],
+      "properties": {
+        "aux_id": "DarioFT/ComfyUI-Qwen3-ASR",
+        "ver": "5cdfee1b78f5b92a3f9d6baeabfb3bc688f551c4",
+        "Node name for S&R": "Qwen3ASRTranscribe"
+      },
+      "widgets_values": [
+        "auto",
+        "",
+        false
+      ]
+    },
+    {
+      "id": 9,
+      "type": "Qwen3VoiceClone",
+      "pos": [
+        841.1541889166685,
+        -5.106494222223706
+      ],
+      "size": [
+        402.1443888888889,
+        324.5160833333333
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "QWEN3_MODEL",
+          "link": 6
+        },
+        {
+          "name": "ref_audio",
+          "shape": 7,
+          "type": "AUDIO",
+          "link": 7
+        },
+        {
+          "name": "prompt",
+          "shape": 7,
+          "type": "QWEN3_PROMPT",
+          "link": null
+        },
+        {
+          "name": "ref_text",
+          "shape": 7,
+          "type": "STRING",
+          "widget": {
+            "name": "ref_text"
+          },
+          "link": 10
+        }
+      ],
+      "outputs": [
+        {
+          "name": "AUDIO",
+          "type": "AUDIO",
+          "links": [
+            9
+          ]
+        }
+      ],
+      "properties": {
+        "cnr_id": "comfyui-qwen3-tts",
+        "ver": "80a0fd71c2ed791285d552727e2a4b77e9b91a3d",
+        "Node name for S&R": "Qwen3VoiceClone"
+      },
+      "widgets_values": [
+        "The Qwen3-ASR family ASR models maintains high-quality and robust recognition under complex acoustic environments and challenging text patterns.",
+        73052475174351,
+        "randomize",
+        "Auto",
+        "",
+        2048,
+        30
+      ]
+    },
+    {
+      "id": 7,
+      "type": "PreviewAudio",
+      "pos": [
+        1307.8932808055577,
+        -3.6268658888902534
+      ],
+      "size": [
+        270,
+        88
+      ],
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 9
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "cnr_id": "comfy-core",
+        "ver": "0.10.0",
+        "Node name for S&R": "PreviewAudio"
+      },
+      "widgets_values": []
+    }
+  ],
+  "links": [
+    [
+      1,
+      4,
+      0,
+      2,
+      1,
+      "AUDIO"
+    ],
+    [
+      2,
+      1,
+      0,
+      2,
+      0,
+      "QWEN3_ASR_MODEL"
+    ],
+    [
+      3,
+      2,
+      0,
+      5,
+      0,
+      "STRING"
+    ],
+    [
+      6,
+      6,
+      0,
+      9,
+      0,
+      "QWEN3_MODEL"
+    ],
+    [
+      7,
+      4,
+      0,
+      9,
+      1,
+      "AUDIO"
+    ],
+    [
+      9,
+      9,
+      0,
+      7,
+      0,
+      "AUDIO"
+    ],
+    [
+      10,
+      2,
+      0,
+      9,
+      3,
+      "STRING"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "workflowRendererVersion": "LG",
+    "ue_links": [],
+    "ds": {
+      "scale": 0.932666649395062,
+      "offset": [
+        166.70858886110872,
+        451.1393831111121
+      ]
+    },
+    "frontendVersion": "1.37.11"
+  },
+  "version": 0.4
+}

ComfyUI-Qwen3-ASR/nodes.py ADDED Viewed

	@@ -0,0 +1,275 @@

+import os
+import shutil
+import torch
+import numpy as np
+import folder_paths
+import comfy.model_management as mm
+from qwen_asr import Qwen3ASRModel
+# Register Qwen3-ASR models folder with ComfyUI
+# NOTE: this runs at import time, so the directory is created as a side effect
+# of loading the module.
+QWEN3_ASR_MODELS_DIR = os.path.join(folder_paths.models_dir, "Qwen3-ASR")
+os.makedirs(QWEN3_ASR_MODELS_DIR, exist_ok=True)
+folder_paths.add_model_folder_path("Qwen3-ASR", QWEN3_ASR_MODELS_DIR)
+# Model repo mappings
+# Hub repo id -> folder name used under QWEN3_ASR_MODELS_DIR.
+QWEN3_ASR_MODELS = {
+    "Qwen/Qwen3-ASR-1.7B": "Qwen3-ASR-1.7B",
+    "Qwen/Qwen3-ASR-0.6B": "Qwen3-ASR-0.6B",
+}
+# Same layout for the optional forced aligners; the "None" entry is the
+# "no aligner" choice offered by the loader dropdown.
+QWEN3_FORCED_ALIGNERS = {
+    "None": None,
+    "Qwen/Qwen3-ForcedAligner-0.6B": "Qwen3-ForcedAligner-0.6B",
+}
+# Supported languages
+# "auto" is translated to ``language=None`` by the transcribe nodes.
+SUPPORTED_LANGUAGES = [
+    "auto",
+    "Chinese", "English", "Cantonese", "Arabic", "German", "French", "Spanish",
+    "Portuguese", "Indonesian", "Italian", "Korean", "Russian", "Thai",
+    "Vietnamese", "Japanese", "Turkish", "Hindi", "Malay", "Dutch", "Swedish",
+    "Danish", "Finnish", "Polish", "Czech", "Filipino", "Persian", "Greek",
+    "Hungarian", "Macedonian", "Romanian"
+]
+def get_local_model_path(repo_id: str) -> str:
+    folder_name = QWEN3_ASR_MODELS.get(repo_id) or QWEN3_FORCED_ALIGNERS.get(repo_id) or repo_id.replace("/", "_")
+    return os.path.join(QWEN3_ASR_MODELS_DIR, folder_name)
+def migrate_cached_model(repo_id: str, target_path: str) -> bool:
+    if os.path.exists(target_path) and os.listdir(target_path):
+        return True
+    hf_cache = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub")
+    hf_model_dir = os.path.join(hf_cache, f"models--{repo_id.replace('/', '--')}")
+    if os.path.exists(hf_model_dir):
+        snapshots_dir = os.path.join(hf_model_dir, "snapshots")
+        if os.path.exists(snapshots_dir):
+            snapshots = os.listdir(snapshots_dir)
+            if snapshots:
+                source = os.path.join(snapshots_dir, snapshots[0])
+                print(f"Migrating model from HuggingFace cache: {source} -> {target_path}")
+                shutil.copytree(source, target_path, dirs_exist_ok=True)
+                return True
+    ms_cache = os.path.join(os.path.expanduser("~"), ".cache", "modelscope", "hub")
+    ms_model_dir = os.path.join(ms_cache, repo_id.replace("/", os.sep))
+    if os.path.exists(ms_model_dir):
+        print(f"Migrating model from ModelScope cache: {ms_model_dir} -> {target_path}")
+        shutil.copytree(ms_model_dir, target_path, dirs_exist_ok=True)
+        return True
+    return False
+def download_model_to_comfyui(repo_id: str, source: str) -> str:
+    target_path = get_local_model_path(repo_id)
+    if migrate_cached_model(repo_id, target_path):
+        print(f"Model available at: {target_path}")
+        return target_path
+    os.makedirs(target_path, exist_ok=True)
+    if source == "ModelScope":
+        from modelscope import snapshot_download
+        print(f"Downloading {repo_id} from ModelScope to {target_path}...")
+        snapshot_download(repo_id, local_dir=target_path)
+    else:
+        from huggingface_hub import snapshot_download
+        print(f"Downloading {repo_id} from HuggingFace to {target_path}...")
+        snapshot_download(repo_id, local_dir=target_path)
+    return target_path
+def load_audio_input(audio_input):
+    if audio_input is None:
+        return None
+    waveform = audio_input["waveform"]
+    sr = audio_input["sample_rate"]
+    wav = waveform[0]
+    if wav.shape[0] > 1:
+        wav = torch.mean(wav, dim=0)
+    else:
+        wav = wav.squeeze(0)
+    return (wav.numpy().astype(np.float32), sr)
+class Qwen3ASRLoader:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "repo_id": (list(QWEN3_ASR_MODELS.keys()), {"default": "Qwen/Qwen3-ASR-1.7B"}),
+                "source": (["HuggingFace", "ModelScope"], {"default": "HuggingFace"}),
+                "precision": (["fp16", "bf16", "fp32"], {"default": "bf16"}),
+                "attention": (["auto", "flash_attention_2", "sdpa", "eager"], {"default": "auto"}),
+            },
+            "optional": {
+                "forced_aligner": (list(QWEN3_FORCED_ALIGNERS.keys()), {"default": "None"}),
+                "local_model_path": ("STRING", {"default": "", "multiline": False}),
+            }
+        }
+    RETURN_TYPES = ("QWEN3_ASR_MODEL",)
+    RETURN_NAMES = ("model",)
+    FUNCTION = "load_model"
+    CATEGORY = "Qwen3-ASR"
+    def load_model(self, repo_id, source, precision, attention, forced_aligner="None", local_model_path=""):
+        device = mm.get_torch_device()
+        dtype = torch.float32
+        if precision == "bf16":
+            if device.type == "mps":
+                dtype = torch.float16
+                print("Note: Using fp16 on MPS (bf16 has limited support)")
+            else:
+                dtype = torch.bfloat16
+        elif precision == "fp16":
+            dtype = torch.float16
+        if local_model_path and local_model_path.strip() != "":
+            model_path = local_model_path.strip()
+            print(f"Loading from local path: {model_path}")
+        else:
+            local_path = get_local_model_path(repo_id)
+            if os.path.exists(local_path) and os.listdir(local_path):
+                model_path = local_path
+                print(f"Loading from ComfyUI models folder: {model_path}")
+            else:
+                model_path = download_model_to_comfyui(repo_id, source)
+        model_kwargs = dict(
+            dtype=dtype,
+            device_map=str(device),
+            max_inference_batch_size=32,
+            max_new_tokens=256,
+        )
+        if attention != "auto":
+            model_kwargs["attn_implementation"] = attention
+        if forced_aligner and forced_aligner != "None":
+            aligner_local = get_local_model_path(forced_aligner)
+            if not (os.path.exists(aligner_local) and os.listdir(aligner_local)):
+                aligner_local = download_model_to_comfyui(forced_aligner, source)
+            model_kwargs["forced_aligner"] = aligner_local
+            model_kwargs["forced_aligner_kwargs"] = dict(
+                dtype=dtype,
+                device_map=str(device),
+            )
+            if attention != "auto":
+                model_kwargs["forced_aligner_kwargs"]["attn_implementation"] = attention
+        print(f"Loading Qwen3-ASR model from {model_path}...")
+        model = Qwen3ASRModel.from_pretrained(model_path, **model_kwargs)
+        return (model,)
+class Qwen3ASRTranscribe:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("QWEN3_ASR_MODEL",),
+                "audio": ("AUDIO",),
+            },
+            "optional": {
+                "language": (SUPPORTED_LANGUAGES, {"default": "auto"}),
+                "context": ("STRING", {"default": "", "multiline": True}),
+                "return_timestamps": ("BOOLEAN", {"default": False}),
+            }
+        }
+    RETURN_TYPES = ("STRING", "STRING", "STRING")
+    RETURN_NAMES = ("text", "language", "timestamps")
+    FUNCTION = "transcribe"
+    CATEGORY = "Qwen3-ASR"
+    def transcribe(self, model, audio, language="auto", context="", return_timestamps=False):
+        audio_data = load_audio_input(audio)
+        if audio_data is None:
+            return ("", "", "")
+        lang = None if language == "auto" else language
+        ctx = context if context.strip() else ""
+        results = model.transcribe(
+            audio=audio_data,
+            language=lang,
+            context=ctx if ctx else None,
+            return_time_stamps=return_timestamps,
+        )
+        result = results[0]
+        text = result.text
+        detected_lang = result.language or ""
+        timestamps_str = ""
+        if return_timestamps and result.time_stamps:
+            ts_lines = []
+            for ts in result.time_stamps:
+                ts_lines.append(f"{ts.start_time:.2f}-{ts.end_time:.2f}: {ts.text}")
+            timestamps_str = "\n".join(ts_lines)
+        return (text, detected_lang, timestamps_str)
+class Qwen3ASRBatchTranscribe:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("QWEN3_ASR_MODEL",),
+                "audio_list": ("AUDIO",),
+            },
+            "optional": {
+                "language": (SUPPORTED_LANGUAGES, {"default": "auto"}),
+                "return_timestamps": ("BOOLEAN", {"default": False}),
+            }
+        }
+    RETURN_TYPES = ("STRING",)
+    RETURN_NAMES = ("transcriptions",)
+    FUNCTION = "batch_transcribe"
+    CATEGORY = "Qwen3-ASR"
+    def batch_transcribe(self, model, audio_list, language="auto", return_timestamps=False):
+        if not isinstance(audio_list, list):
+            audio_list = [audio_list]
+        audio_inputs = []
+        for audio in audio_list:
+            audio_data = load_audio_input(audio)
+            if audio_data:
+                audio_inputs.append(audio_data)
+        if not audio_inputs:
+            return ("",)
+        lang = None if language == "auto" else language
+        languages = [lang] * len(audio_inputs) if lang else None
+        results = model.transcribe(
+            audio=audio_inputs,
+            language=languages,
+            return_time_stamps=return_timestamps,
+        )
+        output_lines = []
+        for i, result in enumerate(results):
+            line = f"[{i}] ({result.language}): {result.text}"
+            output_lines.append(line)
+            if return_timestamps and result.time_stamps:
+                for ts in result.time_stamps:
+                    output_lines.append(f"    {ts.start_time:.2f}-{ts.end_time:.2f}: {ts.text}")
+        return ("\n".join(output_lines),)

ComfyUI-Qwen3-ASR/pyproject.toml ADDED Viewed

	@@ -0,0 +1,16 @@

+[project]
+name = "comfyui-qwen-asr"
+description = "A ComfyUI custom node suite for Qwen3-ASR, supporting speech-to-text transcription with 1.7B and 0.6B models, 52 languages/dialects, and optional timestamp alignment."
+version = "1.0.0"
+license = { text = "Apache-2.0" }
+dependencies = ["qwen-asr", "modelscope", "soundfile", "numpy", "torch", "transformers", "accelerate"]
+[project.urls]
+Repository = "https://github.com/DarioFT/ComfyUI-Qwen3-ASR"
+Documentation = "https://github.com/DarioFT/ComfyUI-Qwen3-ASR/wiki"
+"Bug Tracker" = "https://github.com/DarioFT/ComfyUI-Qwen3-ASR/issues"
+[tool.comfy]
+PublisherId = "darioft"
+DisplayName = "ComfyUI-Qwen3-ASR"

ComfyUI-Qwen3-ASR/requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+qwen-asr
+modelscope
+soundfile
+numpy
+torch
+transformers
+accelerate