---
# metadata
datasets:
- islomov/it_youtube_uzbek_speech_dataset
- islomov/news_youtube_uzbek_speech_dataset
- islomov/podcasts_tashkent_dialect_youtube_uzbek_speech_dataset
language:
- uz
base_model:
- Qwen/Qwen3-ASR-0.6B
---
pip install -U qwen-asr
import torch
from qwen_asr import Qwen3ASRModel
# Load the Saidakmal/ASR_Qwen_0.6B_UZ checkpoint in bfloat16 on the first CUDA device.
model = Qwen3ASRModel.from_pretrained(
    "Saidakmal/ASR_Qwen_0.6B_UZ",
    dtype=torch.bfloat16,
    device_map="cuda:0",
    # attn_implementation="flash_attention_2",
    max_inference_batch_size=32,  # Batch size limit for inference. -1 means unlimited. Smaller values can help avoid OOM.
    max_new_tokens=256,  # Maximum number of tokens to generate. Set a larger value for long audio input.
)

# Transcribe a single audio input. "path_audio" is presumably a placeholder;
# replace it with the path to a real audio file.
results = model.transcribe(
    audio="path_audio",
    # NOTE(review): None presumably lets the model detect the language itself
    # (the result exposes `.language`, printed below) - confirm against the
    # qwen-asr documentation.
    language=None,
)

# Print the language and the transcript of the first result.
print(results[0].language)
print(results[0].text)