"""Gradio demo that transcribes speech with a fine-tuned Whisper model.

The processor and model are loaded once at import time; ``transcribe`` is
the callback wired into the Gradio interface defined at the bottom.
"""

import gradio as gr
import librosa
import numpy as np
import spaces
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

MODEL_ID = "userdotcs/whisper-small-turkish-phono-finetuned"

# Sampling rate the audio is resampled to before feature extraction.
TARGET_SAMPLE_RATE = 16000

# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 128

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the model once, at import time.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(device)


def _prepare_waveform(sr, y):
    """Convert raw samples to a peak-normalized mono float32 signal at 16 kHz.

    Args:
        sr: Sampling rate of ``y`` in Hz.
        y: Non-empty NumPy array of samples; when it has more than one
            dimension, axis 1 is averaged away to produce a mono signal.

    Returns:
        A float32 NumPy array resampled to ``TARGET_SAMPLE_RATE``.
    """
    # 1. Float32 conversion and peak normalization (critical step).
    #    ``astype`` copies, so the in-place division never touches the
    #    caller's array.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # leave pure silence untouched instead of dividing by zero
        y /= peak

    # 2. Downmix to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # 3. Resample to the target rate.
    if sr != TARGET_SAMPLE_RATE:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)

    return y


@spaces.GPU
def transcribe(audio):
    """Transcribe a recording delivered by the Gradio audio component.

    Args:
        audio: ``None`` when nothing was recorded/uploaded, otherwise a
            ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
            array.

    Returns:
        The decoded transcription, or a fixed message when no audio (or an
        empty recording) was supplied.
    """
    if audio is None:
        return "Ses dosyası algılanamadı."

    sr, y = audio

    # A zero-length recording has no peak to normalize by (``np.max`` raises
    # on an empty array), so treat it the same as missing audio.
    if y.size == 0:
        return "Ses dosyası algılanamadı."

    y = _prepare_waveform(sr, y)

    # 4. Build the model input features.
    input_features = processor(
        y,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features.to(device)

    # 5. Inference.
    with torch.no_grad():
        # Optionally, forced_decoder_ids can be used to force the model
        # to Turkish.
        predicted_ids = model.generate(
            input_features,
            max_new_tokens=MAX_NEW_TOKENS,
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]
    return transcription


# User interface
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),  # keep this if you are using Gradio 4+
    outputs="text",
    title="Whisper Small Turkish Phonetic Transcription",
)

demo.launch()